aboutsummaryrefslogtreecommitdiffhomepage
diff options
context:
space:
mode:
-rw-r--r--README.md14
-rw-r--r--RELEASE.md108
-rwxr-xr-xconfigure63
-rw-r--r--grpc.BUILD3
-rw-r--r--tensorflow/cc/gradients/nn_grad_test.cc9
-rw-r--r--tensorflow/contrib/bayesflow/python/kernel_tests/stochastic_tensor_test.py5
-rw-r--r--tensorflow/contrib/bayesflow/python/kernel_tests/stochastic_variables_test.py8
-rw-r--r--tensorflow/contrib/cmake/CMakeLists.txt18
-rw-r--r--tensorflow/contrib/cmake/README.md2
-rw-r--r--tensorflow/contrib/cmake/external/tensorboard.cmake134
-rw-r--r--tensorflow/contrib/cmake/setup.py2
-rw-r--r--tensorflow/contrib/cmake/tf_cc_ops.cmake33
-rw-r--r--tensorflow/contrib/cmake/tf_core_framework.cmake6
-rw-r--r--tensorflow/contrib/cmake/tf_core_kernels.cmake2
-rw-r--r--tensorflow/contrib/cmake/tf_core_ops.cmake2
-rw-r--r--tensorflow/contrib/cmake/tf_python.cmake99
-rw-r--r--tensorflow/contrib/cmake/tf_tests.cmake150
-rw-r--r--tensorflow/contrib/framework/python/ops/prettyprint_ops_test.py2
-rw-r--r--tensorflow/contrib/graph_editor/tests/transform_test.py14
-rw-r--r--tensorflow/contrib/graph_editor/transform.py5
-rw-r--r--tensorflow/contrib/layers/python/layers/feature_column_ops_test.py8
-rw-r--r--tensorflow/contrib/layers/python/layers/layers.py12
-rw-r--r--tensorflow/contrib/layers/python/layers/optimizers_test.py4
-rw-r--r--tensorflow/contrib/learn/python/learn/datasets/base.py16
-rw-r--r--tensorflow/contrib/learn/python/learn/estimators/estimator.py70
-rw-r--r--tensorflow/contrib/learn/python/learn/estimators/estimator_test.py86
-rw-r--r--tensorflow/contrib/learn/python/learn/estimators/linear.py3
-rw-r--r--tensorflow/contrib/learn/python/learn/evaluable.py10
-rw-r--r--tensorflow/contrib/learn/python/learn/graph_actions.py2
-rw-r--r--tensorflow/contrib/learn/python/learn/learn_io/data_feeder.py497
-rw-r--r--tensorflow/contrib/learn/python/learn/learn_io/data_feeder_test.py250
-rw-r--r--tensorflow/contrib/learn/python/learn/trainable.py17
-rw-r--r--tensorflow/contrib/learn/python/learn/utils/__init__.py1
-rw-r--r--tensorflow/contrib/makefile/README.md3
-rwxr-xr-xtensorflow/contrib/makefile/compile_ios_protobuf.sh5
-rw-r--r--tensorflow/contrib/metrics/python/ops/set_ops.py1
-rw-r--r--tensorflow/contrib/rnn/python/kernel_tests/gru_ops_test.py6
-rw-r--r--tensorflow/contrib/rnn/python/kernel_tests/rnn_test.py2
-rw-r--r--tensorflow/contrib/seq2seq/BUILD8
-rw-r--r--tensorflow/contrib/seq2seq/__init__.py7
-rw-r--r--tensorflow/contrib/seq2seq/python/kernel_tests/__init__.py0
-rw-r--r--tensorflow/contrib/seq2seq/python/kernel_tests/decoder_fn_test.py (renamed from tensorflow/contrib/seq2seq/python/kernel_tests/layers_test.py)9
-rw-r--r--tensorflow/contrib/seq2seq/python/kernel_tests/seq2seq_test.py129
-rw-r--r--tensorflow/contrib/seq2seq/python/ops/__init__.py0
-rw-r--r--tensorflow/contrib/seq2seq/python/ops/decoder_fn.py249
-rw-r--r--tensorflow/contrib/seq2seq/python/ops/seq2seq.py208
-rw-r--r--tensorflow/contrib/session_bundle/session_bundle.cc2
-rw-r--r--tensorflow/contrib/slim/README.md6
-rw-r--r--tensorflow/contrib/slim/python/slim/evaluation.py2
-rw-r--r--tensorflow/contrib/slim/python/slim/learning_test.py4
-rw-r--r--tensorflow/contrib/slim/python/slim/model_analyzer.py2
-rw-r--r--tensorflow/contrib/specs/python/specs_test.py4
-rw-r--r--tensorflow/contrib/stat_summarizer/python/stat_summarizer_test.py2
-rw-r--r--tensorflow/contrib/tensorboard/plugins/projector/projector_api_test.py6
-rw-r--r--tensorflow/contrib/training/python/training/device_setter.py2
-rw-r--r--tensorflow/contrib/training/python/training/evaluation_test.py4
-rw-r--r--tensorflow/contrib/training/python/training/training_test.py2
-rw-r--r--tensorflow/core/BUILD32
-rw-r--r--tensorflow/core/common_runtime/direct_session_test.cc8
-rw-r--r--tensorflow/core/common_runtime/executor.h2
-rw-r--r--tensorflow/core/common_runtime/sycl/sycl_allocator.cc14
-rw-r--r--tensorflow/core/common_runtime/sycl/sycl_allocator.h6
-rw-r--r--tensorflow/core/common_runtime/sycl/sycl_device.cc15
-rw-r--r--tensorflow/core/common_runtime/sycl/sycl_device.h16
-rw-r--r--tensorflow/core/common_runtime/sycl/sycl_device_context.cc5
-rw-r--r--tensorflow/core/common_runtime/sycl/sycl_device_factory.cc1
-rw-r--r--tensorflow/core/debug/BUILD2
-rw-r--r--tensorflow/core/debug/debug_gateway_test.cc6
-rw-r--r--tensorflow/core/debug/debug_io_utils.cc6
-rw-r--r--tensorflow/core/debug/debug_io_utils_test.cc3
-rw-r--r--tensorflow/core/framework/partial_tensor_shape_test.cc2
-rw-r--r--tensorflow/core/framework/tensor_testutil.h2
-rw-r--r--tensorflow/core/kernels/BUILD6
-rw-r--r--tensorflow/core/kernels/adjust_contrast_op_test.cc4
-rw-r--r--tensorflow/core/kernels/batch_norm_op_test.cc10
-rw-r--r--tensorflow/core/kernels/cast_op_test.cc15
-rw-r--r--tensorflow/core/kernels/colorspace_op_test.cc10
-rw-r--r--tensorflow/core/kernels/control_flow_ops.cc26
-rw-r--r--tensorflow/core/kernels/control_flow_ops_test.cc18
-rw-r--r--tensorflow/core/kernels/cwise_op_abs.cc12
-rw-r--r--tensorflow/core/kernels/cwise_op_acos.cc12
-rw-r--r--tensorflow/core/kernels/cwise_op_add_1.cc2
-rw-r--r--tensorflow/core/kernels/cwise_op_asin.cc12
-rw-r--r--tensorflow/core/kernels/cwise_op_atan.cc12
-rw-r--r--tensorflow/core/kernels/cwise_op_ceil.cc12
-rw-r--r--tensorflow/core/kernels/cwise_op_cos.cc12
-rw-r--r--tensorflow/core/kernels/cwise_op_div.cc5
-rw-r--r--tensorflow/core/kernels/cwise_op_exp.cc12
-rw-r--r--tensorflow/core/kernels/cwise_op_floor.cc12
-rw-r--r--tensorflow/core/kernels/cwise_op_floor_div.cc8
-rw-r--r--tensorflow/core/kernels/cwise_op_isfinite.cc4
-rw-r--r--tensorflow/core/kernels/cwise_op_isinf.cc4
-rw-r--r--tensorflow/core/kernels/cwise_op_isnan.cc4
-rw-r--r--tensorflow/core/kernels/cwise_op_log.cc12
-rw-r--r--tensorflow/core/kernels/cwise_op_log1p.cc12
-rw-r--r--tensorflow/core/kernels/cwise_op_neg.cc12
-rw-r--r--tensorflow/core/kernels/cwise_op_pow.cc12
-rw-r--r--tensorflow/core/kernels/cwise_op_rsqrt.cc12
-rw-r--r--tensorflow/core/kernels/cwise_op_sin.cc12
-rw-r--r--tensorflow/core/kernels/cwise_op_sqrt.cc12
-rw-r--r--tensorflow/core/kernels/cwise_op_square.cc12
-rw-r--r--tensorflow/core/kernels/cwise_op_sub.cc2
-rw-r--r--tensorflow/core/kernels/cwise_op_tan.cc12
-rw-r--r--tensorflow/core/kernels/cwise_op_tanh.cc12
-rw-r--r--tensorflow/core/kernels/cwise_ops_sycl_common.h8
-rw-r--r--tensorflow/core/kernels/debug_ops.cc29
-rw-r--r--tensorflow/core/kernels/dense_update_ops.cc21
-rw-r--r--tensorflow/core/kernels/fact_op.cc33
-rw-r--r--tensorflow/core/kernels/fused_batch_norm_op_test.cc6
-rw-r--r--tensorflow/core/kernels/non_max_suppression_op_test.cc34
-rw-r--r--tensorflow/core/kernels/resize_bilinear_op_test.cc35
-rw-r--r--tensorflow/core/kernels/spacetobatch_benchmark_test.cc10
-rw-r--r--tensorflow/core/kernels/sparse_add_op_test.cc26
-rw-r--r--tensorflow/core/kernels/sparse_dense_binary_op_shared_test.cc24
-rw-r--r--tensorflow/core/kernels/sparse_reduce_sum_op_test.cc12
-rw-r--r--tensorflow/core/kernels/summary_image_op_test.cc20
-rw-r--r--tensorflow/core/kernels/summary_op_test.cc15
-rw-r--r--tensorflow/core/kernels/training_ops_gpu.cu.cc2
-rw-r--r--tensorflow/core/kernels/variable_ops.cc33
-rw-r--r--tensorflow/core/lib/core/notification_test.cc4
-rw-r--r--tensorflow/core/lib/gtl/cleanup.h2
-rw-r--r--tensorflow/core/lib/gtl/edit_distance_test.cc1
-rw-r--r--tensorflow/core/lib/strings/strcat_test.cc5
-rw-r--r--tensorflow/core/ops/nn_ops_test.cc2
-rw-r--r--tensorflow/core/ops/state_ops.cc2
-rw-r--r--tensorflow/core/platform/default/build_config.bzl41
-rw-r--r--tensorflow/core/platform/default/build_config/BUILD3
-rw-r--r--tensorflow/core/platform/default/logging.cc36
-rw-r--r--tensorflow/core/platform/env.h6
-rw-r--r--tensorflow/core/platform/hadoop/hadoop_file_system.cc5
-rw-r--r--tensorflow/core/platform/port_test.cc10
-rw-r--r--tensorflow/core/platform/subprocess.h2
-rw-r--r--tensorflow/core/platform/windows/subprocess.h27
-rw-r--r--tensorflow/core/platform/windows/test.cc51
-rw-r--r--tensorflow/core/platform/windows/windows_file_system.cc17
-rw-r--r--tensorflow/core/platform/windows/windows_file_system.h3
-rw-r--r--tensorflow/core/public/version.h2
-rw-r--r--tensorflow/core/util/memmapped_file_system.cc5
-rw-r--r--tensorflow/core/util/memmapped_file_system.h14
-rw-r--r--tensorflow/core/util/memmapped_file_system_test.cc11
-rw-r--r--tensorflow/core/util/semver_test.cc4
-rw-r--r--tensorflow/core/util/sparse/sparse_tensor.h3
-rw-r--r--tensorflow/core/util/stat_summarizer.cc4
-rw-r--r--tensorflow/examples/label_image/main.cc3
-rw-r--r--tensorflow/examples/tutorials/mnist/fully_connected_feed.py4
-rw-r--r--tensorflow/examples/tutorials/mnist/mnist_softmax.py4
-rw-r--r--tensorflow/examples/udacity/6_lstm.ipynb8
-rw-r--r--tensorflow/g3doc/api_docs/cc/ClassEnv.md20
-rw-r--r--tensorflow/g3doc/api_docs/cc/ClassEnvWrapper.md8
-rw-r--r--tensorflow/g3doc/api_docs/cc/ClassStatus.md2
-rw-r--r--tensorflow/g3doc/api_docs/cc/ClassTensorShape.md4
-rw-r--r--tensorflow/g3doc/api_docs/cc/ClassTensorShapeUtils.md8
-rw-r--r--tensorflow/g3doc/api_docs/cc/ClassThread.md2
-rw-r--r--tensorflow/g3doc/api_docs/cc/StructTensorShapeDim.md2
-rw-r--r--tensorflow/g3doc/api_docs/python/constant_op.md5
-rw-r--r--tensorflow/g3doc/api_docs/python/contrib.layers.md5
-rw-r--r--tensorflow/g3doc/api_docs/python/contrib.util.md6
-rw-r--r--tensorflow/g3doc/api_docs/python/framework.md11
-rw-r--r--tensorflow/g3doc/api_docs/python/functions_and_classes/shard0/tf.summary.TaggedRunMetadata.md244
-rw-r--r--tensorflow/g3doc/api_docs/python/functions_and_classes/shard2/tf.constant.md5
-rw-r--r--tensorflow/g3doc/api_docs/python/functions_and_classes/shard2/tf.summary.SummaryDescription.md237
-rw-r--r--tensorflow/g3doc/api_docs/python/functions_and_classes/shard2/tf.test.TestCase.md521
-rw-r--r--tensorflow/g3doc/api_docs/python/functions_and_classes/shard4/tf.contrib.layers.batch_norm.md5
-rw-r--r--tensorflow/g3doc/api_docs/python/functions_and_classes/shard4/tf.summary.SummaryDescription.RegisterExtension.md4
-rw-r--r--tensorflow/g3doc/api_docs/python/functions_and_classes/shard5/tf.summary.SummaryDescription.FromString.md4
-rw-r--r--tensorflow/g3doc/api_docs/python/functions_and_classes/shard5/tf.train.replica_device_setter.md2
-rw-r--r--tensorflow/g3doc/api_docs/python/functions_and_classes/shard6/tf.contrib.util.make_tensor_proto.md6
-rw-r--r--tensorflow/g3doc/api_docs/python/functions_and_classes/shard6/tf.summary.TaggedRunMetadata.RegisterExtension.md4
-rw-r--r--tensorflow/g3doc/api_docs/python/functions_and_classes/shard8/tf.GraphKeys.md11
-rw-r--r--tensorflow/g3doc/api_docs/python/functions_and_classes/shard9/tf.summary.TaggedRunMetadata.FromString.md4
-rw-r--r--tensorflow/g3doc/api_docs/python/summary.md481
-rw-r--r--tensorflow/g3doc/api_docs/python/test.md521
-rw-r--r--tensorflow/g3doc/api_docs/python/train.md2
-rw-r--r--tensorflow/g3doc/get_started/index.md3
-rw-r--r--tensorflow/g3doc/get_started/os_setup.md142
-rw-r--r--tensorflow/g3doc/how_tos/adding_an_op/index.md2
-rw-r--r--tensorflow/g3doc/how_tos/adding_an_op/zero_out_2_test.py13
-rw-r--r--tensorflow/g3doc/how_tos/adding_an_op/zero_out_grad_2.py2
-rw-r--r--tensorflow/g3doc/how_tos/hadoop/index.md4
-rw-r--r--tensorflow/g3doc/how_tos/supervisor/index.md2
-rw-r--r--tensorflow/g3doc/resources/roadmap.md1
-rw-r--r--tensorflow/g3doc/tutorials/input_fn/index.md2
-rw-r--r--tensorflow/g3doc/tutorials/wide/index.md2
-rw-r--r--tensorflow/python/BUILD2
-rw-r--r--tensorflow/python/framework/constant_op.py14
-rw-r--r--tensorflow/python/framework/tensor_util.py67
-rw-r--r--tensorflow/python/framework/tensor_util_test.py16
-rw-r--r--tensorflow/python/framework/versions_test.py4
-rw-r--r--tensorflow/python/kernel_tests/BUILD8
-rw-r--r--tensorflow/python/kernel_tests/basic_gpu_test.py162
-rw-r--r--tensorflow/python/kernel_tests/conv_ops_3d_test.py2
-rw-r--r--tensorflow/python/kernel_tests/cwise_ops_test.py5
-rw-r--r--tensorflow/python/kernel_tests/decode_image_op_test.py105
-rw-r--r--tensorflow/python/kernel_tests/io_ops_test.py37
-rw-r--r--tensorflow/python/kernel_tests/seq2seq_test.py770
-rw-r--r--tensorflow/python/ops/control_flow_ops.py2
-rw-r--r--tensorflow/python/ops/ctc_ops.py4
-rw-r--r--tensorflow/python/ops/embedding_ops.py12
-rw-r--r--tensorflow/python/ops/image_ops.py2
-rw-r--r--tensorflow/python/ops/image_ops_impl.py56
-rw-r--r--tensorflow/python/ops/image_ops_test.py2
-rw-r--r--tensorflow/python/ops/logging_ops.py27
-rw-r--r--tensorflow/python/ops/nn_impl.py8
-rw-r--r--tensorflow/python/ops/template.py2
-rw-r--r--tensorflow/python/ops/variables.py4
-rw-r--r--tensorflow/python/platform/test.py1
-rw-r--r--tensorflow/python/training/adadelta_test.py6
-rw-r--r--tensorflow/python/training/device_setter.py2
-rw-r--r--tensorflow/python/training/moving_averages.py2
-rw-r--r--tensorflow/python/training/saver_test.py4
-rw-r--r--tensorflow/python/training/session_manager_test.py10
-rw-r--r--tensorflow/python/training/slot_creator.py25
-rw-r--r--tensorflow/python/training/supervisor.py2
-rw-r--r--tensorflow/python/training/supervisor_test.py8
-rw-r--r--tensorflow/stream_executor/lib/process_state.cc1
-rw-r--r--tensorflow/tensorboard/README.md10
-rw-r--r--tensorflow/tensorboard/backend/server.py2
-rw-r--r--tensorflow/tensorboard/backend/server_test.py5
-rw-r--r--tensorflow/tensorboard/dist/index.html1
-rw-r--r--tensorflow/tensorflow.bzl5
-rwxr-xr-xtensorflow/tools/ci_build/builds/libtensorflow.sh65
-rwxr-xr-xtensorflow/tools/ci_build/builds/pip.sh8
-rwxr-xr-xtensorflow/tools/ci_build/builds/test_installation.sh19
-rwxr-xr-xtensorflow/tools/ci_build/ci_build.sh4
-rwxr-xr-xtensorflow/tools/ci_build/gpu_build/parallel_gpu_execute.sh3
-rwxr-xr-xtensorflow/tools/ci_build/linux/libtensorflow.sh32
-rwxr-xr-x[-rw-r--r--]tensorflow/tools/ci_build/linux/libtensorflow_cpu.sh (renamed from tensorflow/contrib/seq2seq/python/__init__.py)12
-rwxr-xr-xtensorflow/tools/ci_build/linux/libtensorflow_docker.sh53
-rwxr-xr-x[-rw-r--r--]tensorflow/tools/ci_build/linux/libtensorflow_gpu.sh (renamed from tensorflow/contrib/seq2seq/python/ops/layers.py)29
-rwxr-xr-xtensorflow/tools/ci_build/osx/libtensorflow_cpu.sh36
-rwxr-xr-xtensorflow/tools/ci_build/osx/libtensorflow_gpu.sh37
-rwxr-xr-xtensorflow/tools/ci_build/update_version.sh10
-rw-r--r--tensorflow/tools/ci_build/windows/cpu/bazel/common_env.sh50
-rw-r--r--tensorflow/tools/ci_build/windows/cpu/bazel/run_cc_test_windows.bat1
-rw-r--r--tensorflow/tools/ci_build/windows/cpu/bazel/run_cc_test_windows.sh140
-rw-r--r--tensorflow/tools/ci_build/windows/cpu/cmake/run_py.bat19
-rw-r--r--tensorflow/tools/ci_build/windows/cpu/pip/build_tf_windows.sh30
-rw-r--r--tensorflow/tools/ci_build/windows/gpu/cmake/run_build.bat39
-rw-r--r--tensorflow/tools/ci_build/windows/gpu/cmake/run_py.bat46
-rw-r--r--tensorflow/tools/docker/Dockerfile7
-rw-r--r--tensorflow/tools/docker/Dockerfile.devel6
-rw-r--r--tensorflow/tools/docker/Dockerfile.devel-gpu6
-rw-r--r--tensorflow/tools/docker/Dockerfile.gpu7
-rw-r--r--tensorflow/tools/docker/notebooks/3_mnist_from_scratch.ipynb2
-rwxr-xr-xtensorflow/tools/docker/parameterized_docker_build.sh155
-rw-r--r--tensorflow/tools/gcs_test/Dockerfile6
-rwxr-xr-xtensorflow/tools/gcs_test/gcs_smoke.sh33
-rw-r--r--tensorflow/tools/pip_package/setup.py7
-rw-r--r--tensorflow/tools/tfprof/BUILD2
-rw-r--r--third_party/gpus/cuda/BUILD.tpl52
-rw-r--r--third_party/gpus/cuda/platform.bzl.tpl59
-rw-r--r--third_party/gpus/cuda_configure.bzl427
-rwxr-xr-xthird_party/sycl/crosstool/CROSSTOOL.tpl21
-rwxr-xr-xthird_party/sycl/crosstool/computecpp.tpl40
-rw-r--r--tools/bazel.rc.template1
-rw-r--r--zlib.BUILD2
256 files changed, 6167 insertions, 2350 deletions
diff --git a/README.md b/README.md
index 4f6d1459dd..88c1ad1f94 100644
--- a/README.md
+++ b/README.md
@@ -3,9 +3,9 @@
</div>
-----------------
-| **`Linux CPU`** | **`Linux GPU PIP`** | **`Mac OS CPU`** | **`Android`** |
-|-------------------|----------------------|------------------|----------------|
-| [![Build Status](https://ci.tensorflow.org/buildStatus/icon?job=tensorflow-master-cpu)](https://ci.tensorflow.org/job/tensorflow-master-cpu) | [![Build Status](https://ci.tensorflow.org/buildStatus/icon?job=tensorflow-master-gpu_pip)](https://ci.tensorflow.org/job/tensorflow-master-gpu_pip) | [![Build Status](https://ci.tensorflow.org/buildStatus/icon?job=tensorflow-master-mac)](https://ci.tensorflow.org/job/tensorflow-master-mac) | [![Build Status](https://ci.tensorflow.org/buildStatus/icon?job=tensorflow-master-android)](https://ci.tensorflow.org/job/tensorflow-master-android) |
+| **`Linux CPU`** | **`Linux GPU`** | **`Mac OS CPU`** | **`Windows CPU`** | **`Android`** |
+|-----------------|---------------------|------------------|-------------------|---------------|
+| [![Build Status](https://ci.tensorflow.org/buildStatus/icon?job=tensorflow-master-cpu)](https://ci.tensorflow.org/job/tensorflow-master-cpu) | [![Build Status](https://ci.tensorflow.org/buildStatus/icon?job=tensorflow-master-linux-gpu)](https://ci.tensorflow.org/job/tensorflow-master-linux-gpu) | [![Build Status](https://ci.tensorflow.org/buildStatus/icon?job=tensorflow-master-mac)](https://ci.tensorflow.org/job/tensorflow-master-mac) | [![Build Status](https://ci.tensorflow.org/buildStatus/icon?job=tensorflow-master-win-cmake-py)](https://ci.tensorflow.org/job/tensorflow-master-win-cmake-py) | [![Build Status](https://ci.tensorflow.org/buildStatus/icon?job=tensorflow-master-android)](https://ci.tensorflow.org/job/tensorflow-master-android) |
**TensorFlow** is an open source software library for numerical computation using
data flow graphs. Nodes in the graph represent mathematical operations, while
@@ -33,10 +33,10 @@ and discussion.**
People who are a little more adventurous can also try our nightly binaries:
-* Linux CPU-only: [Python 2](https://ci.tensorflow.org/view/Nightly/job/nightly-matrix-cpu/TF_BUILD_IS_OPT=OPT,TF_BUILD_IS_PIP=PIP,TF_BUILD_PYTHON_VERSION=PYTHON2,label=cpu-slave/lastSuccessfulBuild/artifact/pip_test/whl/tensorflow-0.11.0-cp27-none-linux_x86_64.whl) ([build history](https://ci.tensorflow.org/view/Nightly/job/nightly-matrix-cpu/TF_BUILD_IS_OPT=OPT,TF_BUILD_IS_PIP=PIP,TF_BUILD_PYTHON_VERSION=PYTHON2,label=cpu-slave)) / [Python 3.4](https://ci.tensorflow.org/view/Nightly/job/nightly-matrix-cpu/TF_BUILD_IS_OPT=OPT,TF_BUILD_IS_PIP=PIP,TF_BUILD_PYTHON_VERSION=PYTHON3,label=cpu-slave/lastSuccessfulBuild/artifact/pip_test/whl/tensorflow-0.11.0-cp34-cp34m-linux_x86_64.whl) ([build history](https://ci.tensorflow.org/view/Nightly/job/nightly-matrix-cpu/TF_BUILD_IS_OPT=OPT,TF_BUILD_IS_PIP=PIP,TF_BUILD_PYTHON_VERSION=PYTHON3,label=cpu-slave/)) / [Python 3.5](https://ci.tensorflow.org/view/Nightly/job/nightly-python35-linux-cpu/lastSuccessfulBuild/artifact/pip_test/whl/tensorflow-0.11.0-cp35-cp35m-linux_x86_64.whl) ([build history](https://ci.tensorflow.org/view/Nightly/job/nightly-python35-linux-cpu/))
-* Linux GPU: [Python 2](https://ci.tensorflow.org/view/Nightly/job/nightly-matrix-linux-gpu/TF_BUILD_IS_OPT=OPT,TF_BUILD_IS_PIP=PIP,TF_BUILD_PYTHON_VERSION=PYTHON2,label=gpu-linux/lastSuccessfulBuild/artifact/pip_test/whl/tensorflow_gpu-0.11.0-cp27-none-linux_x86_64.whl) ([build history](https://ci.tensorflow.org/view/Nightly/job/nightly-matrix-linux-gpu/TF_BUILD_IS_OPT=OPT,TF_BUILD_IS_PIP=PIP,TF_BUILD_PYTHON_VERSION=PYTHON2,label=gpu-linux/)) / [Python 3.4](https://ci.tensorflow.org/view/Nightly/job/nightly-matrix-linux-gpu/TF_BUILD_IS_OPT=OPT,TF_BUILD_IS_PIP=PIP,TF_BUILD_PYTHON_VERSION=PYTHON3,label=gpu-linux/lastSuccessfulBuild/artifact/pip_test/whl/tensorflow_gpu-0.11.0-cp34-cp34m-linux_x86_64.whl) ([build history](https://ci.tensorflow.org/view/Nightly/job/nightly-matrix-linux-gpu/TF_BUILD_IS_OPT=OPT,TF_BUILD_IS_PIP=PIP,TF_BUILD_PYTHON_VERSION=PYTHON3,label=gpu-linux/)) / [Python 3.5](https://ci.tensorflow.org/view/Nightly/job/nightly-matrix-linux-gpu/TF_BUILD_IS_OPT=OPT,TF_BUILD_IS_PIP=PIP,TF_BUILD_PYTHON_VERSION=PYTHON3.5,label=gpu-linux/lastSuccessfulBuild/artifact/pip_test/whl/tensorflow_gpu-0.11.0-cp35-cp35m-linux_x86_64.whl) ([build history](https://ci.tensorflow.org/view/Nightly/job/nightly-matrix-linux-gpu/TF_BUILD_IS_OPT=OPT,TF_BUILD_IS_PIP=PIP,TF_BUILD_PYTHON_VERSION=PYTHON3.5,label=gpu-linux/))
-* Mac CPU-only: [Python 2](https://ci.tensorflow.org/view/Nightly/job/nightly-matrix-cpu/TF_BUILD_IS_OPT=OPT,TF_BUILD_IS_PIP=PIP,TF_BUILD_PYTHON_VERSION=PYTHON2,label=mac-slave/lastSuccessfulBuild/artifact/pip_test/whl/tensorflow-0.11.0-py2-none-any.whl) ([build history](https://ci.tensorflow.org/view/Nightly/job/nightly-matrix-cpu/TF_BUILD_IS_OPT=OPT,TF_BUILD_IS_PIP=PIP,TF_BUILD_PYTHON_VERSION=PYTHON2,label=mac-slave/)) / [Python 3](https://ci.tensorflow.org/view/Nightly/job/nightly-matrix-cpu/TF_BUILD_IS_OPT=OPT,TF_BUILD_IS_PIP=PIP,TF_BUILD_PYTHON_VERSION=PYTHON3,label=mac-slave/lastSuccessfulBuild/artifact/pip_test/whl/tensorflow-0.11.0-py3-none-any.whl) ([build history](https://ci.tensorflow.org/view/Nightly/job/nightly-matrix-cpu/TF_BUILD_IS_OPT=OPT,TF_BUILD_IS_PIP=PIP,TF_BUILD_PYTHON_VERSION=PYTHON3,label=mac-slave/))
-* Mac GPU: [Python 2](https://ci.tensorflow.org/view/Nightly/job/nightly-matrix-mac-gpu/TF_BUILD_IS_OPT=OPT,TF_BUILD_IS_PIP=PIP,TF_BUILD_PYTHON_VERSION=PYTHON2,label=gpu-mac/lastSuccessfulBuild/artifact/pip_test/whl/tensorflow_gpu-0.11.0-py2-none-any.whl) ([build history](https://ci.tensorflow.org/view/Nightly/job/nightly-matrix-mac-gpu/TF_BUILD_IS_OPT=OPT,TF_BUILD_IS_PIP=PIP,TF_BUILD_PYTHON_VERSION=PYTHON2,label=gpu-mac/)) / [Python 3](https://ci.tensorflow.org/view/Nightly/job/nightly-matrix-mac-gpu/TF_BUILD_IS_OPT=OPT,TF_BUILD_IS_PIP=PIP,TF_BUILD_PYTHON_VERSION=PYTHON3,label=gpu-mac/lastSuccessfulBuild/artifact/pip_test/whl/tensorflow_gpu-0.11.0-py3-none-any.whl) ([build history](https://ci.tensorflow.org/view/Nightly/job/nightly-matrix-mac-gpu/TF_BUILD_IS_OPT=OPT,TF_BUILD_IS_PIP=PIP,TF_BUILD_PYTHON_VERSION=PYTHON3,label=gpu-mac/))
+* Linux CPU-only: [Python 2](https://ci.tensorflow.org/view/Nightly/job/nightly-matrix-cpu/TF_BUILD_IS_OPT=OPT,TF_BUILD_IS_PIP=PIP,TF_BUILD_PYTHON_VERSION=PYTHON2,label=cpu-slave/lastSuccessfulBuild/artifact/pip_test/whl/tensorflow-0.12.0rc0-cp27-none-linux_x86_64.whl) ([build history](https://ci.tensorflow.org/view/Nightly/job/nightly-matrix-cpu/TF_BUILD_IS_OPT=OPT,TF_BUILD_IS_PIP=PIP,TF_BUILD_PYTHON_VERSION=PYTHON2,label=cpu-slave)) / [Python 3.4](https://ci.tensorflow.org/view/Nightly/job/nightly-matrix-cpu/TF_BUILD_IS_OPT=OPT,TF_BUILD_IS_PIP=PIP,TF_BUILD_PYTHON_VERSION=PYTHON3,label=cpu-slave/lastSuccessfulBuild/artifact/pip_test/whl/tensorflow-0.12.0rc0-cp34-cp34m-linux_x86_64.whl) ([build history](https://ci.tensorflow.org/view/Nightly/job/nightly-matrix-cpu/TF_BUILD_IS_OPT=OPT,TF_BUILD_IS_PIP=PIP,TF_BUILD_PYTHON_VERSION=PYTHON3,label=cpu-slave/)) / [Python 3.5](https://ci.tensorflow.org/view/Nightly/job/nightly-python35-linux-cpu/lastSuccessfulBuild/artifact/pip_test/whl/tensorflow-0.12.0rc0-cp35-cp35m-linux_x86_64.whl) ([build history](https://ci.tensorflow.org/view/Nightly/job/nightly-python35-linux-cpu/))
+* Linux GPU: [Python 2](https://ci.tensorflow.org/view/Nightly/job/nightly-matrix-linux-gpu/TF_BUILD_IS_OPT=OPT,TF_BUILD_IS_PIP=PIP,TF_BUILD_PYTHON_VERSION=PYTHON2,label=gpu-linux/lastSuccessfulBuild/artifact/pip_test/whl/tensorflow_gpu-0.12.0rc0-cp27-none-linux_x86_64.whl) ([build history](https://ci.tensorflow.org/view/Nightly/job/nightly-matrix-linux-gpu/TF_BUILD_IS_OPT=OPT,TF_BUILD_IS_PIP=PIP,TF_BUILD_PYTHON_VERSION=PYTHON2,label=gpu-linux/)) / [Python 3.4](https://ci.tensorflow.org/view/Nightly/job/nightly-matrix-linux-gpu/TF_BUILD_IS_OPT=OPT,TF_BUILD_IS_PIP=PIP,TF_BUILD_PYTHON_VERSION=PYTHON3,label=gpu-linux/lastSuccessfulBuild/artifact/pip_test/whl/tensorflow_gpu-0.12.0rc0-cp34-cp34m-linux_x86_64.whl) ([build history](https://ci.tensorflow.org/view/Nightly/job/nightly-matrix-linux-gpu/TF_BUILD_IS_OPT=OPT,TF_BUILD_IS_PIP=PIP,TF_BUILD_PYTHON_VERSION=PYTHON3,label=gpu-linux/)) / [Python 3.5](https://ci.tensorflow.org/view/Nightly/job/nightly-matrix-linux-gpu/TF_BUILD_IS_OPT=OPT,TF_BUILD_IS_PIP=PIP,TF_BUILD_PYTHON_VERSION=PYTHON3.5,label=gpu-linux/lastSuccessfulBuild/artifact/pip_test/whl/tensorflow_gpu-0.12.0rc0-cp35-cp35m-linux_x86_64.whl) ([build history](https://ci.tensorflow.org/view/Nightly/job/nightly-matrix-linux-gpu/TF_BUILD_IS_OPT=OPT,TF_BUILD_IS_PIP=PIP,TF_BUILD_PYTHON_VERSION=PYTHON3.5,label=gpu-linux/))
+* Mac CPU-only: [Python 2](https://ci.tensorflow.org/view/Nightly/job/nightly-matrix-cpu/TF_BUILD_IS_OPT=OPT,TF_BUILD_IS_PIP=PIP,TF_BUILD_PYTHON_VERSION=PYTHON2,label=mac-slave/lastSuccessfulBuild/artifact/pip_test/whl/tensorflow-0.12.0rc0-py2-none-any.whl) ([build history](https://ci.tensorflow.org/view/Nightly/job/nightly-matrix-cpu/TF_BUILD_IS_OPT=OPT,TF_BUILD_IS_PIP=PIP,TF_BUILD_PYTHON_VERSION=PYTHON2,label=mac-slave/)) / [Python 3](https://ci.tensorflow.org/view/Nightly/job/nightly-matrix-cpu/TF_BUILD_IS_OPT=OPT,TF_BUILD_IS_PIP=PIP,TF_BUILD_PYTHON_VERSION=PYTHON3,label=mac-slave/lastSuccessfulBuild/artifact/pip_test/whl/tensorflow-0.12.0rc0-py3-none-any.whl) ([build history](https://ci.tensorflow.org/view/Nightly/job/nightly-matrix-cpu/TF_BUILD_IS_OPT=OPT,TF_BUILD_IS_PIP=PIP,TF_BUILD_PYTHON_VERSION=PYTHON3,label=mac-slave/))
+* Mac GPU: [Python 2](https://ci.tensorflow.org/view/Nightly/job/nightly-matrix-mac-gpu/TF_BUILD_IS_OPT=OPT,TF_BUILD_IS_PIP=PIP,TF_BUILD_PYTHON_VERSION=PYTHON2,label=gpu-mac/lastSuccessfulBuild/artifact/pip_test/whl/tensorflow_gpu-0.12.0rc0-py2-none-any.whl) ([build history](https://ci.tensorflow.org/view/Nightly/job/nightly-matrix-mac-gpu/TF_BUILD_IS_OPT=OPT,TF_BUILD_IS_PIP=PIP,TF_BUILD_PYTHON_VERSION=PYTHON2,label=gpu-mac/)) / [Python 3](https://ci.tensorflow.org/view/Nightly/job/nightly-matrix-mac-gpu/TF_BUILD_IS_OPT=OPT,TF_BUILD_IS_PIP=PIP,TF_BUILD_PYTHON_VERSION=PYTHON3,label=gpu-mac/lastSuccessfulBuild/artifact/pip_test/whl/tensorflow_gpu-0.12.0rc0-py3-none-any.whl) ([build history](https://ci.tensorflow.org/view/Nightly/job/nightly-matrix-mac-gpu/TF_BUILD_IS_OPT=OPT,TF_BUILD_IS_PIP=PIP,TF_BUILD_PYTHON_VERSION=PYTHON3,label=gpu-mac/))
* [Android](https://ci.tensorflow.org/view/Nightly/job/nightly-matrix-android/TF_BUILD_CONTAINER_TYPE=ANDROID,TF_BUILD_IS_OPT=OPT,TF_BUILD_IS_PIP=NO_PIP,TF_BUILD_PYTHON_VERSION=PYTHON2,label=android-slave/lastSuccessfulBuild/artifact/bazel-out/local_linux/bin/tensorflow/examples/android/tensorflow_demo.apk) ([build history](https://ci.tensorflow.org/view/Nightly/job/nightly-matrix-android/TF_BUILD_CONTAINER_TYPE=ANDROID,TF_BUILD_IS_OPT=OPT,TF_BUILD_IS_PIP=NO_PIP,TF_BUILD_PYTHON_VERSION=PYTHON2,label=android-slave/))
#### *Try your first TensorFlow program*
diff --git a/RELEASE.md b/RELEASE.md
index 8c7bda5b89..8c3af73a5b 100644
--- a/RELEASE.md
+++ b/RELEASE.md
@@ -6,9 +6,49 @@
semantics. tf.div is renamed to tf.division. New operators tf.truncatediv and
tf.truncatemod are available for achieving the previous C++ (truncation)
division/modulus semantics.
+
+# Release 0.12.0
+
+## Major Features and Improvements
+
+* TensorFlow now builds and runs on Microsoft Windows (tested on Windows 10,
+ Windows 7, and Windows Server 2016). Supported languages include Python (via a
+ pip package) and C++. CUDA 8.0 and cuDNN 5.1 are supported for GPU
+ acceleration. Known limitations include: It is not currently possible to load
+ a custom op library. The GCS and HDFS file systems are not currently
+ supported. The following ops are not currently implemented:
+ DepthwiseConv2dNative, DepthwiseConv2dNativeBackpropFilter,
+ DepthwiseConv2dNativeBackpropInput, Dequantize, Digamma, Erf, Erfc, Igamma,
+ Igammac, Lgamma, Polygamma, QuantizeAndDequantize, QuantizedAvgPool,
+ QuantizedBatchNormWithGlobalNormalization, QuantizedBiasAdd, QuantizedConcat,
+ QuantizedConv2D, QuantizedMatmul, QuantizedMaxPool,
+ QuantizeDownAndShrinkRange, QuantizedRelu, QuantizedRelu6, QuantizedReshape,
+ QuantizeV2, RequantizationRange, and Requantize.
+* Go: Experimental API in Go to create and execute graphs
+ (https://godoc.org/github.com/tensorflow/tensorflow/tensorflow/go)
+* New checkpoint format becomes the default in `tf.train.Saver`. Old V1
+ checkpoints continue to be readable; controlled by the `write_version`
+ argument, `tf.train.Saver` now by default writes out in the new V2
+ format. It significantly reduces the peak memory required and latency
+ incurred during restore.
+* Added a new library of matrix-free (iterative) solvers for linear
+ equations, linear least-squares, eigenvalues and singular values in
+ tensorflow/contrib/solvers. Initial version has lanczos bidiagonalization,
+ conjugate gradients and CGLS.
+* Added gradients for `matrix_solve_ls` and `self_adjoint_eig`.
+* Large cleanup to add second order gradient for ops with C++ gradients and
+ improve existing gradients such that most ops can now be differentiated
+ multiple times.
+* Added a solver for ordinary differential equations,
+ `tf.contrib.integrate.odeint`.
+* New contrib module for tensors with named axes, `tf.contrib.labeled_tensor`.
+* Visualization of embeddings in TensorBoard.
+
+## Breaking Changes to the API
+
* `BusAdjacency` enum replaced with a protocol buffer `DeviceLocality`. PCI bus
-indexing now starts from 1 instead of 0, and bus_id==0 is used where previously
-BUS_ANY was used.
+ indexing now starts from 1 instead of 0, and bus_id==0 is used where
+ previously BUS_ANY was used.
* `Env::FileExists` and `FileSystem::FileExists` now return a tensorflow::Status
 instead of a bool. Any callers to this function can be converted to a bool
by adding .ok() to the call.
@@ -16,8 +56,9 @@ BUS_ANY was used.
indicating its preferred use in language bindings for TensorFlow.
What was previously `TF_Session` has been renamed to `TF_DeprecatedSession`.
* Renamed TF_Port to TF_Output in the C API.
-* Removes RegisterShape from public API. Use C++ shape function registration
- instead.
+* Removes RegisterShape from public API. Use C++ shape function registration instead.
+ indexing now starts from 1 instead of 0, and `bus_id==0` is used where
+ previously `BUS_ANY` was used.
* Most RNN cells and RNN functions now use different variable scopes to be
consistent with layers (`tf.contrib.layers`). This means old checkpoints
written using this code will not load after this change without providing
@@ -27,6 +68,65 @@ BUS_ANY was used.
* Deprecated tf.select op. tf.where should be used instead.
* `SparseTensor.shape` has been renamed to `SparseTensor.dense_shape`. Same for
`SparseTensorValue.shape`.
+* `Env::FileExists` and `FileSystem::FileExists` now return a
+  `tensorflow::Status` instead of a bool. Any callers to this function can be
+ converted to a bool by adding `.ok()` to the call.
+* C API: Type `TF_SessionWithGraph` has been renamed to `TF_Session`, indicating
+ its preferred use in language bindings for TensorFlow. What was previously
+ `TF_Session` has been renamed to `TF_DeprecatedSession`.
+* C API: Renamed `TF_Port` to `TF_Output`.
+* C API: The caller retains ownership of `TF_Tensor` objects provided to
+ `TF_Run`, `TF_SessionRun`, `TF_SetAttrTensor` etc.
+* Renamed `tf.image.per_image_whitening()` to
+ `tf.image.per_image_standardization()`
+* Move Summary protobuf constructors to `tf.summary` submodule.
+* Deprecate `histogram_summary`, `audio_summary`, `scalar_summary`,
+ `image_summary`, `merge_summary`, and `merge_all_summaries`.
+* Combined `batch_*` and regular version of linear algebra and FFT ops. The
+ regular op now handles batches as well. All `batch_*` Python interfaces were
+ removed.
+* `tf.all_variables`, `tf.VARIABLES` and `tf.initialize_all_variables` renamed
+ to `tf.global_variables`, `tf.GLOBAL_VARIABLES` and
+ `tf.global_variables_initializer` respectively.
+
+## Bug Fixes and Other Changes
+
+* Use threadsafe version of `lgamma` function.
+* Fix `tf.sqrt` handling of negative arguments.
+* Fixed bug causing incorrect number of threads to be used for multi-threaded
+ benchmarks.
+* Performance optimizations for `batch_matmul` on multi-core CPUs.
+* Improve trace, `matrix_set_diag`, `matrix_diag_part` and their gradients to
+ work for rectangular matrices.
+* Support for SVD of complex valued matrices.
+
+
+## Thanks to our Contributors
+
+This release contains contributions from many people at Google, as well as:
+
+@a7744hsc, Abhi Agg, @admcrae, Adriano Carmezim, Aki Sukegawa, Alex Kendall,
+Alexander Rosenberg Johansen, @amcrae, Amlan Kar, Andre Simpelo, Andreas Eberle,
+Andrew Hundt, Arnaud Lenglet, @b0noI, Balachander Ramachandran, Ben Barsdell,
+Ben Guidarelli, Benjamin Mularczyk, Burness Duan, @c0g, Changming Sun,
+@chanis, Corey Wharton, Dan J, Daniel Trebbien, Darren Garvey, David Brailovsky,
+David Jones, Di Zeng, @DjangoPeng, Dr. Kashif Rasul, @drag0, Fabrizio (Misto)
+Milo, FabríCio Ceschin, @fp, @Ghedeon, @guschmue, Gökçen Eraslan, Haosdent
+Huang, Haroen Viaene, Harold Cooper, Henrik Holst, @hoangmit, Ivan Ukhov, Javier
+Dehesa, Jingtian Peng, Jithin Odattu, Joan Pastor, Johan Mathe, Johannes Mayer,
+Jongwook Choi, Justus Schwabedal, Kai Wolf, Kamil Hryniewicz, Kamran Amini,
+Karen Brems, Karl Lattimer, @kborer, Ken Shirriff, Kevin Rose, Larissa Laich,
+Laurent Mazare, Leonard Lee, Liang-Chi Hsieh, Liangliang He, Luke Iwanski,
+Marek Kolodziej, Moustafa Alzantot, @MrQianjinsi, @nagachika, Neil Han, Nick
+Meehan, Niels Ole Salscheider, Nikhil Mishra, @nschuc, Ondrej Skopek, OndřEj
+Filip, @OscarDPan, Pablo Moyano, Przemyslaw Tredak, @qitaishui, @Quarazy,
+@raix852, Philipp Helo, Sam Abrahams, @SriramRamesh, Till Hoffmann, Tushar Soni,
+@tvn, @tyfkda, Uwe Schmidt, Victor Villas, Vit Stepanovs, Vladislav Gubarev,
+@wujingyue, Xuesong Yang, Yi Liu, Yilei Yang, @youyou3, Yuan (Terry) Tang,
+Yuming Wang, Zafar Takhirov, @zhongyuk, Ziming Dong, @guotong1988
+
+We are also grateful to all who filed issues or helped resolve them, asked and
+answered questions, and were part of inspiring discussions.
# Release 0.11.0
diff --git a/configure b/configure
index dacaebb490..65a11ec582 100755
--- a/configure
+++ b/configure
@@ -52,7 +52,6 @@ done
if is_windows; then
TF_NEED_GCP=0
TF_NEED_HDFS=0
- TF_NEED_CUDA=0
TF_NEED_OPENCL=0
fi
@@ -80,10 +79,10 @@ if [ "$TF_NEED_GCP" == "1" ]; then
fi
# Update Bazel build configuration.
- perl -pi -e "s,WITH_GCP_SUPPORT = (False|True),WITH_GCP_SUPPORT = True,s" tensorflow/core/platform/default/build_config.bzl
+ sed -i -e "s/WITH_GCP_SUPPORT = False/WITH_GCP_SUPPORT = True/" tensorflow/core/platform/default/build_config.bzl
else
# Update Bazel build configuration.
- perl -pi -e "s,WITH_GCP_SUPPORT = (False|True),WITH_GCP_SUPPORT = False,s" tensorflow/core/platform/default/build_config.bzl
+ sed -i -e "s/WITH_GCP_SUPPORT = True/WITH_GCP_SUPPORT = False/" tensorflow/core/platform/default/build_config.bzl
fi
while [ "$TF_NEED_HDFS" == "" ]; do
@@ -102,10 +101,10 @@ done
if [ "$TF_NEED_HDFS" == "1" ]; then
# Update Bazel build configuration.
- perl -pi -e "s,WITH_HDFS_SUPPORT = (False|True),WITH_HDFS_SUPPORT = True,s" tensorflow/core/platform/default/build_config.bzl
+ sed -i -e "s/WITH_HDFS_SUPPORT = False/WITH_HDFS_SUPPORT = True/" tensorflow/core/platform/default/build_config.bzl
else
# Update Bazel build configuration.
- perl -pi -e "s,WITH_HDFS_SUPPORT = (False|True),WITH_HDFS_SUPPORT = False,s" tensorflow/core/platform/default/build_config.bzl
+ sed -i -e "s/WITH_HDFS_SUPPORT = True/WITH_HDFS_SUPPORT = False/" tensorflow/core/platform/default/build_config.bzl
fi
# Invoke python_config and set up symlinks to python includes
@@ -131,11 +130,11 @@ done
## Set up Cuda-related environment settings
while [ "$TF_NEED_CUDA" == "" ]; do
- read -p "Do you wish to build TensorFlow with GPU support? [y/N] " INPUT
+ read -p "Do you wish to build TensorFlow with CUDA support? [y/N] " INPUT
case $INPUT in
- [Yy]* ) echo "GPU support will be enabled for TensorFlow"; TF_NEED_CUDA=1;;
- [Nn]* ) echo "No GPU support will be enabled for TensorFlow"; TF_NEED_CUDA=0;;
- "" ) echo "No GPU support will be enabled for TensorFlow"; TF_NEED_CUDA=0;;
+ [Yy]* ) echo "CUDA support will be enabled for TensorFlow"; TF_NEED_CUDA=1;;
+ [Nn]* ) echo "No CUDA support will be enabled for TensorFlow"; TF_NEED_CUDA=0;;
+ "" ) echo "No CUDA support will be enabled for TensorFlow"; TF_NEED_CUDA=0;;
* ) echo "Invalid selection: " $INPUT;;
esac
done
@@ -150,14 +149,15 @@ fi
if [ "$TF_NEED_CUDA" == "1" ]; then
# Set up which gcc nvcc should use as the host compiler
-while true; do
+# No need to set this on Windows
+while ! is_windows && true; do
fromuser=""
if [ -z "$GCC_HOST_COMPILER_PATH" ]; then
default_gcc_host_compiler_path=$(which gcc || true)
read -p "Please specify which gcc should be used by nvcc as the host compiler. [Default is $default_gcc_host_compiler_path]: " GCC_HOST_COMPILER_PATH
fromuser="1"
if [ -z "$GCC_HOST_COMPILER_PATH" ]; then
- GCC_HOST_COMPILER_PATH=$default_gcc_host_compiler_path
+ GCC_HOST_COMPILER_PATH="$default_gcc_host_compiler_path"
fi
fi
if [ -e "$GCC_HOST_COMPILER_PATH" ]; then
@@ -178,16 +178,23 @@ OSNAME=`uname -s`
while true; do
# Configure the Cuda SDK version to use.
if [ -z "$TF_CUDA_VERSION" ]; then
- read -p "Please specify the Cuda SDK version you want to use, e.g. 7.0. [Leave empty to use system default]: " TF_CUDA_VERSION
+ read -p "Please specify the CUDA SDK version you want to use, e.g. 7.0. [Leave empty to use system default]: " TF_CUDA_VERSION
fi
fromuser=""
if [ -z "$CUDA_TOOLKIT_PATH" ]; then
default_cuda_path=/usr/local/cuda
+ if is_windows; then
+ if [ -z "$CUDA_PATH" ]; then
+ default_cuda_path="C:/Program Files/NVIDIA GPU Computing Toolkit/CUDA/v8.0"
+ else
+ default_cuda_path="$(cygpath -m "$CUDA_PATH")"
+ fi
+ fi
read -p "Please specify the location where CUDA $TF_CUDA_VERSION toolkit is installed. Refer to README.md for more details. [Default is $default_cuda_path]: " CUDA_TOOLKIT_PATH
fromuser="1"
if [ -z "$CUDA_TOOLKIT_PATH" ]; then
- CUDA_TOOLKIT_PATH=$default_cuda_path
+ CUDA_TOOLKIT_PATH="$default_cuda_path"
fi
fi
@@ -197,7 +204,9 @@ while true; do
TF_CUDA_EXT=".$TF_CUDA_VERSION"
fi
- if [ "$OSNAME" == "Linux" ]; then
+ if is_windows; then
+ CUDA_RT_LIB_PATH="lib/x64/cudart.lib"
+ elif [ "$OSNAME" == "Linux" ]; then
CUDA_RT_LIB_PATH="lib64/libcudart.so${TF_CUDA_EXT}"
elif [ "$OSNAME" == "Darwin" ]; then
CUDA_RT_LIB_PATH="lib/libcudart${TF_CUDA_EXT}.dylib"
@@ -235,14 +244,17 @@ while true; do
fi
# Result returned from "read" will be used unexpanded. That make "~" unuseable.
# Going through one more level of expansion to handle that.
- CUDNN_INSTALL_PATH=`${PYTHON_BIN_PATH} -c "import os; print(os.path.realpath(os.path.expanduser('${CUDNN_INSTALL_PATH}')))"`
+ CUDNN_INSTALL_PATH=`"${PYTHON_BIN_PATH}" -c "import os; print(os.path.realpath(os.path.expanduser('${CUDNN_INSTALL_PATH}')))"`
fi
if [[ -z "$TF_CUDNN_VERSION" ]]; then
TF_CUDNN_EXT=""
cudnn_lib_path=""
cudnn_alt_lib_path=""
- if [ "$OSNAME" == "Linux" ]; then
+ if is_windows; then
+ cudnn_lib_path="${CUDNN_INSTALL_PATH}/lib/x64/cudnn.lib"
+ cudnn_alt_lib_path="${CUDNN_INSTALL_PATH}/lib/x64/cudnn.lib"
+ elif [ "$OSNAME" == "Linux" ]; then
cudnn_lib_path="${CUDNN_INSTALL_PATH}/lib64/libcudnn.so"
cudnn_alt_lib_path="${CUDNN_INSTALL_PATH}/libcudnn.so"
elif [ "$OSNAME" == "Darwin" ]; then
@@ -255,9 +267,9 @@ while true; do
# If the path is not a symlink, readlink will exit with an error code, so
# in that case, we return the path itself.
if [ -f "$cudnn_lib_path" ]; then
- REALVAL=`readlink ${cudnn_lib_path} || echo "${cudnn_lib_path}"`
+ REALVAL=`readlink "${cudnn_lib_path}" || echo "${cudnn_lib_path}"`
else
- REALVAL=`readlink ${cudnn_alt_lib_path} || echo "${cudnn_alt_lib_path}"`
+ REALVAL=`readlink "${cudnn_alt_lib_path}" || echo "${cudnn_alt_lib_path}"`
fi
# Extract the version of the SONAME, if it was indeed symlinked to
@@ -275,7 +287,10 @@ while true; do
TF_CUDNN_EXT=".$TF_CUDNN_VERSION"
fi
- if [ "$OSNAME" == "Linux" ]; then
+ if is_windows; then
+ CUDA_DNN_LIB_PATH="lib/x64/cudnn.lib"
+ CUDA_DNN_LIB_ALT_PATH="lib/x64/cudnn.lib"
+ elif [ "$OSNAME" == "Linux" ]; then
CUDA_DNN_LIB_PATH="lib64/libcudnn.so${TF_CUDNN_EXT}"
CUDA_DNN_LIB_ALT_PATH="libcudnn.so${TF_CUDNN_EXT}"
elif [ "$OSNAME" == "Darwin" ]; then
@@ -350,6 +365,16 @@ EOF
TF_CUDA_COMPUTE_CAPABILITIES=""
done
+if is_windows; then
+ # The following three variables are needed for MSVC toolchain configuration in Bazel
+ export CUDA_PATH="$CUDA_TOOLKIT_PATH"
+ export CUDA_COMPUTE_CAPABILITIES="$TF_CUDA_COMPUTE_CAPABILITIES"
+ export NO_WHOLE_ARCHIVE_OPTION=1
+
+ # Set GCC_HOST_COMPILER_PATH to keep cuda_configure.bzl happy
+ export GCC_HOST_COMPILER_PATH="/usr/bin/dummy_compiler"
+fi
+
# end of if "$TF_NEED_CUDA" == "1"
fi
diff --git a/grpc.BUILD b/grpc.BUILD
index ac34bfe521..e501db57e5 100644
--- a/grpc.BUILD
+++ b/grpc.BUILD
@@ -1561,6 +1561,7 @@ cc_library(
"include/grpc++/impl/codegen/create_auth_context.h",
"include/grpc++/impl/codegen/grpc_library.h",
"include/grpc++/impl/codegen/method_handler_impl.h",
+ "include/grpc++/impl/codegen/proto_utils.h",
"include/grpc++/impl/codegen/rpc_method.h",
"include/grpc++/impl/codegen/rpc_service_method.h",
"include/grpc++/impl/codegen/security/auth_context.h",
@@ -1766,10 +1767,12 @@ cc_library(
"include/grpc++/impl/codegen/completion_queue.h",
"include/grpc++/impl/codegen/completion_queue_tag.h",
"include/grpc++/impl/codegen/config.h",
+ "include/grpc++/impl/codegen/config_protobuf.h",
"include/grpc++/impl/codegen/core_codegen_interface.h",
"include/grpc++/impl/codegen/create_auth_context.h",
"include/grpc++/impl/codegen/grpc_library.h",
"include/grpc++/impl/codegen/method_handler_impl.h",
+ "include/grpc++/impl/codegen/proto_utils.h",
"include/grpc++/impl/codegen/rpc_method.h",
"include/grpc++/impl/codegen/rpc_service_method.h",
"include/grpc++/impl/codegen/security/auth_context.h",
diff --git a/tensorflow/cc/gradients/nn_grad_test.cc b/tensorflow/cc/gradients/nn_grad_test.cc
index 4e37d0d585..70c9bd4e08 100644
--- a/tensorflow/cc/gradients/nn_grad_test.cc
+++ b/tensorflow/cc/gradients/nn_grad_test.cc
@@ -63,7 +63,8 @@ TEST_F(NNGradTest, ReluGrad) {
auto y = Relu(scope_, x);
// Avoid input values where ReLU gradient is not well defined (around zero).
Tensor x_init_value = test::AsTensor<float>(
- {-0.9, -0.7, -0.5, -0.3, -0.1, 0.1, 0.3, 0.5, 0.7, 0.9}, {5, 2});
+ {-0.9f, -0.7f, -0.5f, -0.3f, -0.1f, 0.1f, 0.3f, 0.5f, 0.7f, 0.9f},
+ {5, 2});
RunTest(x, x_init_value, y, shape);
}
@@ -74,7 +75,8 @@ TEST_F(NNGradTest, Relu6Grad) {
// Avoid input values where ReLU gradient is not well defined (around zero
// and six).
Tensor x_init_value = test::AsTensor<float>(
- {-0.9, -0.7, -0.5, -0.3, -0.1, 6.1, 6.3, 6.5, 6.7, 6.9}, {5, 2});
+ {-0.9f, -0.7f, -0.5f, -0.3f, -0.1f, 6.1f, 6.3f, 6.5f, 6.7f, 6.9f},
+ {5, 2});
RunTest(x, x_init_value, y, shape);
}
@@ -83,7 +85,8 @@ TEST_F(NNGradTest, EluGrad) {
auto x = Placeholder(scope_, DT_FLOAT, Placeholder::Shape(shape));
auto y = Elu(scope_, x);
Tensor x_init_value = test::AsTensor<float>(
- {-0.9, -0.7, -0.5, -0.3, -0.1, 0.1, 0.3, 0.5, 0.7, 0.9}, {5, 2});
+ {-0.9f, -0.7f, -0.5f, -0.3f, -0.1f, 0.1f, 0.3f, 0.5f, 0.7f, 0.9f},
+ {5, 2});
RunTest(x, x_init_value, y, shape);
}
diff --git a/tensorflow/contrib/bayesflow/python/kernel_tests/stochastic_tensor_test.py b/tensorflow/contrib/bayesflow/python/kernel_tests/stochastic_tensor_test.py
index b73e87ce28..f0f2fab64f 100644
--- a/tensorflow/contrib/bayesflow/python/kernel_tests/stochastic_tensor_test.py
+++ b/tensorflow/contrib/bayesflow/python/kernel_tests/stochastic_tensor_test.py
@@ -229,3 +229,8 @@ class ObservedStochasticTensorTest(tf.test.TestCase):
distributions.Normal(mu=mu, sigma=sigma),
value=tf.zeros(
(1, 2), dtype=tf.int32))
+
+
+if __name__ == "__main__":
+ tf.test.main()
+
diff --git a/tensorflow/contrib/bayesflow/python/kernel_tests/stochastic_variables_test.py b/tensorflow/contrib/bayesflow/python/kernel_tests/stochastic_variables_test.py
index 1646abcd9f..74bf699d22 100644
--- a/tensorflow/contrib/bayesflow/python/kernel_tests/stochastic_variables_test.py
+++ b/tensorflow/contrib/bayesflow/python/kernel_tests/stochastic_variables_test.py
@@ -42,8 +42,8 @@ class StochasticVariablesTest(tf.test.TestCase):
self.assertEqual(
{"stochastic_variables/sv_mu", "stochastic_variables/sv_sigma"},
- set([v.op.name for v in tf.all_variables()]))
- self.assertEqual(set(tf.trainable_variables()), set(tf.all_variables()))
+ set([v.op.name for v in tf.global_variables()]))
+ self.assertEqual(set(tf.trainable_variables()), set(tf.global_variables()))
v = tf.convert_to_tensor(v)
self.assertEqual(list(shape), v.get_shape().as_list())
@@ -64,7 +64,7 @@ class StochasticVariablesTest(tf.test.TestCase):
})):
v = tf.get_variable("sv")
- for var in tf.all_variables():
+ for var in tf.global_variables():
if "mu" in var.name:
mu_var = var
if "sigma" in var.name:
@@ -96,7 +96,7 @@ class StochasticVariablesTest(tf.test.TestCase):
})):
v = tf.get_variable("sv", shape)
- for var in tf.all_variables():
+ for var in tf.global_variables():
if "mu" in var.name:
mu_var = var
if "sigma" in var.name:
diff --git a/tensorflow/contrib/cmake/CMakeLists.txt b/tensorflow/contrib/cmake/CMakeLists.txt
index b5246cb151..ec6be97151 100644
--- a/tensorflow/contrib/cmake/CMakeLists.txt
+++ b/tensorflow/contrib/cmake/CMakeLists.txt
@@ -1,5 +1,5 @@
# Minimum CMake required
-cmake_minimum_required(VERSION 3.1)
+cmake_minimum_required(VERSION 3.5)
# Project
project(tensorflow C CXX)
@@ -28,9 +28,11 @@ option(tensorflow_BUILD_CONTRIB_KERNELS "Build OpKernels from tensorflow/contrib
option(tensorflow_BUILD_CC_TESTS "Build cc unit tests " OFF)
option(tensorflow_BUILD_PYTHON_TESTS "Build python unit tests " OFF)
-#Threads: defines CMAKE_THREAD_LIBS_INIT and adds -pthread compile option for
-# targets that link ${CMAKE_THREAD_LIBS_INIT}.
-find_package (Threads)
+if (NOT WIN32)
+ # Threads: defines CMAKE_THREAD_LIBS_INIT and adds -pthread compile option
+ # for targets that link ${CMAKE_THREAD_LIBS_INIT}.
+ find_package (Threads)
+endif()
# [CLEANUP] Remove when done
# For debugging
@@ -55,6 +57,7 @@ if(WIN32)
add_definitions(-DNOMINMAX -D_WIN32_WINNT=0x0A00 -DLANG_CXX11 -DCOMPILER_MSVC -D__VERSION__=\"MSVC\")
add_definitions(-DWIN32 -DOS_WIN -D_MBCS -DWIN64 -DWIN32_LEAN_AND_MEAN -DNOGDI -DPLATFORM_WINDOWS)
add_definitions(-DTENSORFLOW_USE_EIGEN_THREADPOOL -DEIGEN_HAS_C99_MATH -D_ITERATOR_DEBUG_LEVEL=0)
+ add_definitions(-DNDEBUG /O2) # Equivalent of -c opt in Bazel.
add_definitions(/bigobj /nologo /EHsc /GF /FC /MP /Gm-)
# Suppress warnings to reduce build log size.
add_definitions(/wd4267 /wd4244 /wd4800 /wd4503 /wd4554 /wd4996 /wd4348 /wd4018)
@@ -147,11 +150,11 @@ if (tensorflow_ENABLE_GPU)
# by default we assume compute cabability 3.5 and 5.2. If you change this change it in
# CUDA_NVCC_FLAGS and cuda_config.h below
- set(CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS};-gencode arch=compute_35,code=\"sm_35,compute_35\";-gencode arch=compute_52,code=\"sm_52,compute_52\")
+ set(CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS};-gencode arch=compute_30,code=\"sm_30,compute_30\";-gencode arch=compute_35,code=\"sm_35,compute_35\";-gencode arch=compute_52,code=\"sm_52,compute_52\")
set(CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS};--include-path ${PROJECT_BINARY_DIR}/$\{build_configuration\};--expt-relaxed-constexpr)
set(CUDA_INCLUDE ${CUDA_TOOLKIT_TARGET_DIR} ${CUDA_TOOLKIT_TARGET_DIR}/extras/CUPTI/include)
include_directories(${CUDA_INCLUDE})
- add_definitions(-DGOOGLE_CUDA=1 -DTF_EXTRA_CUDA_CAPABILITIES=3.5,5.2)
+ add_definitions(-DGOOGLE_CUDA=1 -DTF_EXTRA_CUDA_CAPABILITIES=3.0,3.5,5.2)
# add cudnn
include_directories(${CUDNN_HOME})
@@ -161,7 +164,7 @@ if (tensorflow_ENABLE_GPU)
FILE(WRITE ${tensorflow_source_dir}/third_party/gpus/cuda/cuda_config.h
"#ifndef CUDA_CUDA_CONFIG_H_\n"
"#define CUDA_CUDA_CONFIG_H_\n"
- "#define TF_CUDA_CAPABILITIES CudaVersion(\"3.5\"),CudaVersion(\"5.2\")\n"
+ "#define TF_CUDA_CAPABILITIES CudaVersion(\"3.0\"),CudaVersion(\"3.5\"),CudaVersion(\"5.2\")\n"
"#define TF_CUDA_VERSION \"64_80\"\n"
"#define TF_CUDNN_VERSION \"64_5\"\n"
"#endif // CUDA_CUDA_CONFIG_H_\n"
@@ -207,6 +210,7 @@ if(tensorflow_BUILD_CC_EXAMPLE)
include(tf_label_image_example.cmake)
endif()
if(tensorflow_BUILD_PYTHON_BINDINGS)
+ include(tensorboard)
include(tf_python.cmake)
endif()
if (tensorflow_BUILD_CC_TESTS OR tensorflow_BUILD_PYTHON_TESTS)
diff --git a/tensorflow/contrib/cmake/README.md b/tensorflow/contrib/cmake/README.md
index 3f8dcc525b..252a9e16bd 100644
--- a/tensorflow/contrib/cmake/README.md
+++ b/tensorflow/contrib/cmake/README.md
@@ -21,7 +21,7 @@ Note: Windows support is in an **alpha** state, and we welcome your feedback.
### Pre-requisites
-* CMake version 3.1 up to 3.6
+* CMake version 3.5 up to 3.6
* [Git](http://git-scm.com)
diff --git a/tensorflow/contrib/cmake/external/tensorboard.cmake b/tensorflow/contrib/cmake/external/tensorboard.cmake
new file mode 100644
index 0000000000..dd2613a15c
--- /dev/null
+++ b/tensorflow/contrib/cmake/external/tensorboard.cmake
@@ -0,0 +1,134 @@
+include (ExternalProject)
+
+set(tensorboard_dependencies)
+add_custom_target(tensorboard_copy_dependencies)
+
+function(tb_new_http_archive)
+ cmake_parse_arguments(_TB "" "NAME;URL" "FILES" ${ARGN})
+ ExternalProject_Add(${_TB_NAME}
+ PREFIX ${_TB_NAME}
+ URL ${_TB_URL}
+ DOWNLOAD_DIR "${DOWNLOAD_LOCATION}/${_TB_NAME}"
+ CONFIGURE_COMMAND ""
+ BUILD_COMMAND ""
+ INSTALL_COMMAND ""
+ )
+
+ set(src_dir "${CMAKE_CURRENT_BINARY_DIR}/${_TB_NAME}/src/${_TB_NAME}")
+ set(dst_dir "${CMAKE_CURRENT_BINARY_DIR}/tensorboard_external/${_TB_NAME}")
+
+ foreach(src_file ${_TB_FILES})
+ add_custom_command(
+ TARGET tensorboard_copy_dependencies PRE_BUILD
+ COMMAND ${CMAKE_COMMAND} -E copy ${src_dir}/${src_file} ${dst_dir}/${src_file}
+ )
+ endforeach()
+
+ set(tensorboard_dependencies ${tensorboard_dependencies} ${_TB_NAME} PARENT_SCOPE)
+endfunction()
+
+function(tb_http_file)
+ cmake_parse_arguments(_TB "" "NAME;URL" "" ${ARGN})
+ get_filename_component(src_file ${_TB_URL} NAME)
+ file(DOWNLOAD ${_TB_URL} "${DOWNLOAD_LOCATION}/${_TB_NAME}/${src_file}")
+
+ set(src_dir "${DOWNLOAD_LOCATION}/${_TB_NAME}")
+ set(dst_dir "${CMAKE_CURRENT_BINARY_DIR}/tensorboard_external/${_TB_NAME}/file")
+
+ add_custom_command(
+ TARGET tensorboard_copy_dependencies PRE_BUILD
+ COMMAND ${CMAKE_COMMAND} -E copy ${src_dir}/${src_file} ${dst_dir}/${src_file}
+ )
+
+ add_custom_target(${_TB_NAME} DEPENDS ${src_dir}/${src_file})
+ set(tensorboard_dependencies ${tensorboard_dependencies} ${_TB_NAME} PARENT_SCOPE)
+endfunction()
+
+# Parse TensorBoard dependency names and URLs from Bazel's WORKSPACE file.
+set(tb_dep_names)
+file(STRINGS ${PROJECT_SOURCE_DIR}/../../../WORKSPACE workspace_contents)
+foreach(line ${workspace_contents})
+ if(line MATCHES "# TENSORBOARD_BOWER_AUTOGENERATED_BELOW_THIS_LINE_DO_NOT_EDIT")
+ set(tb_deps_started 1)
+ endif()
+
+ if(NOT tb_deps_started)
+ continue()
+ endif()
+
+ if(line MATCHES "new_http_archive\\(")
+ set(tb_dep_is_archive 1)
+ continue()
+ elseif(line MATCHES "http_file\\(")
+ set(tb_dep_is_archive 0)
+ continue()
+ endif()
+
+ string(REGEX MATCH "name.*=.*\"(.*)\"" has_name ${line})
+ if(has_name)
+ set(tb_dep_name ${CMAKE_MATCH_1})
+ continue()
+ endif()
+
+ string(REGEX MATCH "url.*=.*\"(.*)\"" has_url ${line})
+ if(has_url)
+ list(APPEND tb_dep_names ${tb_dep_name})
+ set(${tb_dep_name}_is_archive ${tb_dep_is_archive})
+ set(${tb_dep_name}_url ${CMAKE_MATCH_1})
+ endif()
+endforeach()
+
+# Parse the files needed for each TensorBoard dependency from Bazel's bower.BUILD file.
+# Due to CMAKE quirkiness, cannot use file(strings) with files that contain '[' and ']'.
+file(READ ${PROJECT_SOURCE_DIR}/../../../bower.BUILD bower_build_contents)
+string(REPLACE "\[" "OB" bower_build_contents "${bower_build_contents}")
+string(REPLACE "\]" "CB" bower_build_contents "${bower_build_contents}")
+string(REPLACE ";" "\\\\;" bower_build_contents "${bower_build_contents}")
+string(REPLACE "\n" "E;" bower_build_contents "${bower_build_contents}")
+foreach(line ${bower_build_contents})
+ string(REGEX MATCH "name.*=.*\"(.*)\"" has_name ${line})
+ if(has_name)
+ set(tb_dep_name ${CMAKE_MATCH_1})
+ set(${tb_dep_name}_files)
+ continue()
+ endif()
+
+ string(REGEX MATCH "srcs.*=.*\"(.*)\"CB" has_single_line_src ${line})
+ if(has_single_line_src)
+ list(APPEND ${tb_dep_name}_files ${CMAKE_MATCH_1})
+ continue()
+ endif()
+
+ if(line MATCHES "srcs.*=.*OB")
+ set(inside_files_def 1)
+ continue()
+ elseif(line MATCHES "CB,")
+ set(inside_files_def 0)
+ continue()
+ endif()
+
+ if(inside_files_def)
+ string(REGEX MATCH "\"(.*)\"," has_file ${line})
+ if(has_file)
+ list(APPEND ${tb_dep_name}_files ${CMAKE_MATCH_1})
+ endif()
+ endif()
+endforeach()
+
+# Generate a target for each dependency.
+foreach(tb_dep_name ${tb_dep_names})
+ if (${tb_dep_name}_is_archive)
+ tb_new_http_archive(
+ NAME ${tb_dep_name}
+ URL ${${tb_dep_name}_url}
+ FILES ${${tb_dep_name}_files}
+ )
+ else()
+ tb_http_file(
+ NAME ${tb_dep_name}
+ URL ${${tb_dep_name}_url}
+ )
+ endif()
+endforeach()
+
+add_dependencies(tensorboard_copy_dependencies ${tensorboard_dependencies})
diff --git a/tensorflow/contrib/cmake/setup.py b/tensorflow/contrib/cmake/setup.py
index b036016f5e..48adeb26e3 100644
--- a/tensorflow/contrib/cmake/setup.py
+++ b/tensorflow/contrib/cmake/setup.py
@@ -26,7 +26,7 @@ from setuptools import find_packages, setup, Command
from setuptools.command.install import install as InstallCommandBase
from setuptools.dist import Distribution
-_VERSION = '0.11.0-cmake-experimental'
+_VERSION = '0.12.0-rc0-cmake-experimental'
REQUIRED_PACKAGES = [
'numpy >= 1.11.0',
diff --git a/tensorflow/contrib/cmake/tf_cc_ops.cmake b/tensorflow/contrib/cmake/tf_cc_ops.cmake
index 6d9c495574..b33f318ed3 100644
--- a/tensorflow/contrib/cmake/tf_cc_ops.cmake
+++ b/tensorflow/contrib/cmake/tf_cc_ops.cmake
@@ -84,3 +84,36 @@ add_library(tf_cc_ops OBJECT
"${tensorflow_source_dir}/tensorflow/cc/ops/const_op.cc"
"${tensorflow_source_dir}/tensorflow/cc/ops/standard_ops.h"
)
+
+########################################################
+# tf_cc library
+########################################################
+file(GLOB_RECURSE tf_cc_srcs
+ "${tensorflow_source_dir}/tensorflow/cc/client/*.h"
+ "${tensorflow_source_dir}/tensorflow/cc/client/*.cc"
+ "${tensorflow_source_dir}/tensorflow/cc/gradients/*.h"
+ "${tensorflow_source_dir}/tensorflow/cc/gradients/*.cc"
+ "${tensorflow_source_dir}/tensorflow/cc/saved_model/*.h"
+ "${tensorflow_source_dir}/tensorflow/cc/saved_model/*.cc"
+ "${tensorflow_source_dir}/tensorflow/cc/training/*.h"
+ "${tensorflow_source_dir}/tensorflow/cc/training/*.cc"
+)
+
+set(tf_cc_srcs
+ ${tf_cc_srcs}
+ "${tensorflow_source_dir}/tensorflow/cc/framework/grad_op_registry.h"
+ "${tensorflow_source_dir}/tensorflow/cc/framework/grad_op_registry.cc"
+ "${tensorflow_source_dir}/tensorflow/cc/framework/gradient_checker.h"
+ "${tensorflow_source_dir}/tensorflow/cc/framework/gradient_checker.cc"
+ "${tensorflow_source_dir}/tensorflow/cc/framework/gradients.h"
+ "${tensorflow_source_dir}/tensorflow/cc/framework/gradients.cc"
+)
+
+file(GLOB_RECURSE tf_cc_test_srcs
+ "${tensorflow_source_dir}/tensorflow/cc/*test*.cc"
+)
+
+list(REMOVE_ITEM tf_cc_srcs ${tf_cc_test_srcs})
+
+add_library(tf_cc OBJECT ${tf_cc_srcs})
+add_dependencies(tf_cc tf_cc_framework tf_cc_ops)
diff --git a/tensorflow/contrib/cmake/tf_core_framework.cmake b/tensorflow/contrib/cmake/tf_core_framework.cmake
index e903471f36..d5e02056c4 100644
--- a/tensorflow/contrib/cmake/tf_core_framework.cmake
+++ b/tensorflow/contrib/cmake/tf_core_framework.cmake
@@ -210,11 +210,7 @@ file(GLOB_RECURSE tf_core_framework_test_srcs
"${tensorflow_source_dir}/tensorflow/core/util/*main.cc"
)
-list(REMOVE_ITEM tf_core_framework_srcs ${tf_core_framework_test_srcs}
- "${tensorflow_source_dir}/tensorflow/core/util/memmapped_file_system.cc"
- "${tensorflow_source_dir}/tensorflow/core/util/memmapped_file_system.h"
- "${tensorflow_source_dir}/tensorflow/core/util/memmapped_file_system_writer.cc"
-)
+list(REMOVE_ITEM tf_core_framework_srcs ${tf_core_framework_test_srcs})
add_library(tf_core_framework OBJECT
${tf_core_framework_srcs}
diff --git a/tensorflow/contrib/cmake/tf_core_kernels.cmake b/tensorflow/contrib/cmake/tf_core_kernels.cmake
index 96554145f3..dac0406ba8 100644
--- a/tensorflow/contrib/cmake/tf_core_kernels.cmake
+++ b/tensorflow/contrib/cmake/tf_core_kernels.cmake
@@ -90,8 +90,6 @@ list(REMOVE_ITEM tf_core_kernels_srcs ${tf_core_kernels_exclude_srcs})
if(WIN32)
file(GLOB_RECURSE tf_core_kernels_windows_exclude_srcs
# not working on windows yet
- "${tensorflow_source_dir}/tensorflow/core/kernels/depthwise_conv_op.cc" # Cannot find symbol: tensorflow::LaunchConv2DOp<struct Eigen::ThreadPoolDevice, double>::launch(...).
- "${tensorflow_source_dir}/tensorflow/core/kernels/fact_op.cc"
"${tensorflow_source_dir}/tensorflow/core/kernels/meta_support.*"
"${tensorflow_source_dir}/tensorflow/core/kernels/*quantiz*.h"
"${tensorflow_source_dir}/tensorflow/core/kernels/*quantiz*.cc"
diff --git a/tensorflow/contrib/cmake/tf_core_ops.cmake b/tensorflow/contrib/cmake/tf_core_ops.cmake
index 5523023cb7..a9791cdeb7 100644
--- a/tensorflow/contrib/cmake/tf_core_ops.cmake
+++ b/tensorflow/contrib/cmake/tf_core_ops.cmake
@@ -9,6 +9,7 @@ set(tf_op_lib_names
"io_ops"
"linalg_ops"
"logging_ops"
+ "losses"
"math_ops"
"nn_ops"
"no_op"
@@ -17,6 +18,7 @@ set(tf_op_lib_names
"resource_variable_ops"
"script_ops"
"sdca_ops"
+ "set_ops"
"sendrecv_ops"
"sparse_ops"
"state_ops"
diff --git a/tensorflow/contrib/cmake/tf_python.cmake b/tensorflow/contrib/cmake/tf_python.cmake
index 072d01200e..ce305a4b1e 100644
--- a/tensorflow/contrib/cmake/tf_python.cmake
+++ b/tensorflow/contrib/cmake/tf_python.cmake
@@ -68,7 +68,7 @@ function(RELATIVE_PROTOBUF_GENERATE_PYTHON ROOT_DIR SRCS)
add_custom_command(
OUTPUT "${CMAKE_CURRENT_BINARY_DIR}/tf_python/${REL_DIR}/${FIL_WE}_pb2.py"
COMMAND ${PROTOBUF_PROTOC_EXECUTABLE}
- ARGS --python_out ${CMAKE_CURRENT_BINARY_DIR}/tf_python/ -I ${ROOT_DIR} -I ${PROTOBUF_INCLUDE_DIRS} ${ABS_FIL}
+ ARGS --python_out ${CMAKE_CURRENT_BINARY_DIR}/tf_python/ -I ${ROOT_DIR} -I ${PROTOBUF_INCLUDE_DIRS} ${ABS_FIL}
DEPENDS ${PROTOBUF_PROTOC_EXECUTABLE} protobuf
COMMENT "Running Python protocol buffer compiler on ${FIL}"
VERBATIM )
@@ -118,12 +118,20 @@ RELATIVE_PROTOBUF_GENERATE_PYTHON(
${tensorflow_source_dir} PYTHON_PROTO_GENFILES ${tf_protos_python_srcs}
)
+# NOTE(mrry): Avoid regenerating the tensorflow/core protos because this
+# can cause benign-but-failing-on-Windows-due-to-file-locking conflicts
+# when two rules attempt to generate the same file.
+file(GLOB_RECURSE tf_python_protos_cc_srcs RELATIVE ${tensorflow_source_dir}
+ "${tensorflow_source_dir}/tensorflow/python/*.proto"
+ "${tensorflow_source_dir}/tensorflow/contrib/session_bundle/*.proto"
+ "${tensorflow_source_dir}/tensorflow/contrib/tensorboard/*.proto"
+)
RELATIVE_PROTOBUF_GENERATE_CPP(PROTO_SRCS PROTO_HDRS
- ${tensorflow_source_dir} ${tf_protos_python_srcs}
+ ${tensorflow_source_dir} ${tf_python_protos_cc_srcs}
)
add_library(tf_python_protos_cc ${PROTO_SRCS} ${PROTO_HDRS})
-
+add_dependencies(tf_python_protos_cc tf_protos_cc)
# tf_python_touchup_modules adds empty __init__.py files to all
# directories containing Python code, so that Python will recognize
@@ -141,7 +149,7 @@ function(add_python_module MODULE_NAME)
set(options DONTCOPY)
cmake_parse_arguments(ADD_PYTHON_MODULE "${options}" "" "" ${ARGN})
add_custom_command(TARGET tf_python_touchup_modules PRE_BUILD
- COMMAND ${CMAKE_COMMAND} -E make_directory "${CMAKE_CURRENT_BINARY_DIR}/tf_python/${MODULE_NAME}")
+ COMMAND ${CMAKE_COMMAND} -E make_directory "${CMAKE_CURRENT_BINARY_DIR}/tf_python/${MODULE_NAME}")
add_custom_command(TARGET tf_python_touchup_modules PRE_BUILD
COMMAND ${CMAKE_COMMAND} -E touch "${CMAKE_CURRENT_BINARY_DIR}/tf_python/${MODULE_NAME}/__init__.py")
file(GLOB module_python_srcs RELATIVE ${tensorflow_source_dir}
@@ -164,6 +172,9 @@ add_python_module("tensorflow/core/lib")
add_python_module("tensorflow/core/lib/core")
add_python_module("tensorflow/core/protobuf")
add_python_module("tensorflow/core/util")
+add_python_module("tensorflow/examples")
+add_python_module("tensorflow/examples/tutorials")
+add_python_module("tensorflow/examples/tutorials/mnist")
add_python_module("tensorflow/python")
add_python_module("tensorflow/python/client")
add_python_module("tensorflow/python/debug")
@@ -172,6 +183,7 @@ add_python_module("tensorflow/python/debug/examples")
add_python_module("tensorflow/python/debug/wrappers")
add_python_module("tensorflow/python/framework")
add_python_module("tensorflow/python/kernel_tests")
+add_python_module("tensorflow/python/layers")
add_python_module("tensorflow/python/lib")
add_python_module("tensorflow/python/lib/core")
add_python_module("tensorflow/python/lib/io")
@@ -179,6 +191,7 @@ add_python_module("tensorflow/python/ops")
add_python_module("tensorflow/python/platform")
add_python_module("tensorflow/python/platform/default")
add_python_module("tensorflow/python/platform/summary")
+add_python_module("tensorflow/python/saved_model")
add_python_module("tensorflow/python/summary")
add_python_module("tensorflow/python/summary/impl")
add_python_module("tensorflow/python/summary/writer")
@@ -187,8 +200,13 @@ add_python_module("tensorflow/python/training")
add_python_module("tensorflow/python/user_ops")
add_python_module("tensorflow/python/util")
add_python_module("tensorflow/python/util/protobuf")
-
-add_python_module("tensorflow/contrib/")
+add_python_module("tensorflow/tensorboard")
+add_python_module("tensorflow/tensorboard/backend")
+add_python_module("tensorflow/tensorboard/lib/python")
+add_python_module("tensorflow/tensorboard/plugins")
+add_python_module("tensorflow/tensorboard/plugins/projector")
+add_python_module("tensorflow/tensorboard/scripts")
+add_python_module("tensorflow/contrib")
add_python_module("tensorflow/contrib/android")
add_python_module("tensorflow/contrib/android/java")
add_python_module("tensorflow/contrib/android/java/org")
@@ -215,6 +233,7 @@ add_python_module("tensorflow/contrib/cudnn_rnn/ops")
add_python_module("tensorflow/contrib/cudnn_rnn/python")
add_python_module("tensorflow/contrib/cudnn_rnn/python/kernel_tests")
add_python_module("tensorflow/contrib/cudnn_rnn/python/ops")
+add_python_module("tensorflow/contrib/deprecated")
add_python_module("tensorflow/contrib/distributions")
add_python_module("tensorflow/contrib/distributions/python")
add_python_module("tensorflow/contrib/distributions/python/kernel_tests")
@@ -256,6 +275,9 @@ add_python_module("tensorflow/contrib/ios_examples/camera/en.lproj")
add_python_module("tensorflow/contrib/ios_examples/simple")
add_python_module("tensorflow/contrib/ios_examples/simple/data")
add_python_module("tensorflow/contrib/ios_examples/simple/tf_ios_makefile_example.xcodeproj")
+add_python_module("tensorflow/contrib/labeled_tensor")
+add_python_module("tensorflow/contrib/labeled_tensor/python")
+add_python_module("tensorflow/contrib/labeled_tensor/python/ops")
add_python_module("tensorflow/contrib/layers")
add_python_module("tensorflow/contrib/layers/kernels")
add_python_module("tensorflow/contrib/layers/ops")
@@ -279,6 +301,10 @@ add_python_module("tensorflow/contrib/learn/python/learn/preprocessing/tests")
add_python_module("tensorflow/contrib/learn/python/learn/tests")
add_python_module("tensorflow/contrib/learn/python/learn/tests/dataframe")
add_python_module("tensorflow/contrib/learn/python/learn/utils")
+add_python_module("tensorflow/contrib/linalg")
+add_python_module("tensorflow/contrib/linalg/python")
+add_python_module("tensorflow/contrib/linalg/python/ops")
+add_python_module("tensorflow/contrib/linalg/python/kernel_tests")
add_python_module("tensorflow/contrib/linear_optimizer")
add_python_module("tensorflow/contrib/linear_optimizer/kernels")
add_python_module("tensorflow/contrib/linear_optimizer/kernels/g3doc")
@@ -329,8 +355,12 @@ add_python_module("tensorflow/contrib/slim/python")
add_python_module("tensorflow/contrib/slim/python/slim")
add_python_module("tensorflow/contrib/slim/python/slim/data")
add_python_module("tensorflow/contrib/slim/python/slim/nets")
+add_python_module("tensorflow/contrib/solvers")
+add_python_module("tensorflow/contrib/solvers/python")
+add_python_module("tensorflow/contrib/solvers/python/ops")
add_python_module("tensorflow/contrib/specs")
add_python_module("tensorflow/contrib/specs/python")
+add_python_module("tensorflow/contrib/stat_summarizer")
add_python_module("tensorflow/contrib/tensorboard")
add_python_module("tensorflow/contrib/tensorboard/plugins")
add_python_module("tensorflow/contrib/tensorboard/plugins/projector")
@@ -350,10 +380,6 @@ add_python_module("tensorflow/contrib/tensor_forest/hybrid/python/ops")
add_python_module("tensorflow/contrib/tensor_forest/python")
add_python_module("tensorflow/contrib/tensor_forest/python/kernel_tests")
add_python_module("tensorflow/contrib/tensor_forest/python/ops")
-add_python_module("tensorflow/contrib/tensorboard")
-add_python_module("tensorflow/contrib/tensorboard")
-add_python_module("tensorflow/contrib/tensorboard/plugins")
-add_python_module("tensorflow/contrib/tensorboard/plugins/projector")
add_python_module("tensorflow/contrib/testing")
add_python_module("tensorflow/contrib/testing/python")
add_python_module("tensorflow/contrib/testing/python/framework")
@@ -366,6 +392,12 @@ add_python_module("tensorflow/contrib/training/python")
add_python_module("tensorflow/contrib/training/python/training")
add_python_module("tensorflow/contrib/util")
+# Additional directories with no Python sources.
+add_custom_command(TARGET tf_python_touchup_modules PRE_BUILD
+ COMMAND ${CMAKE_COMMAND} -E make_directory "${CMAKE_CURRENT_BINARY_DIR}/tf_python/tensorflow/tensorboard/dist")
+add_custom_command(TARGET tf_python_touchup_modules PRE_BUILD
+ COMMAND ${CMAKE_COMMAND} -E make_directory "${CMAKE_CURRENT_BINARY_DIR}/tf_python/tensorflow/tensorboard/lib/css")
+
########################################################
# tf_python_op_gen_main library
@@ -394,7 +426,7 @@ function(GENERATE_PYTHON_OP_LIB tf_python_op_lib_name)
set(oneValueArgs DESTINATION)
set(multiValueArgs ADDITIONAL_LIBRARIES)
cmake_parse_arguments(GENERATE_PYTHON_OP_LIB
- "" "${oneValueArgs}" "${multiValueArgs}" ${ARGN})
+ "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN})
if(NOT DEFINED GENERATE_PYTHON_OP_LIB_DESTINATION)
# Default destination is tf_python/tensorflow/python/ops/gen_<...>.py.
set(GENERATE_PYTHON_OP_LIB_DESTINATION
@@ -424,7 +456,7 @@ function(GENERATE_PYTHON_OP_LIB tf_python_op_lib_name)
COMMAND ${tf_python_op_lib_name}_gen_python @${tensorflow_source_dir}/tensorflow/python/ops/hidden_ops.txt 1 > ${GENERATE_PYTHON_OP_LIB_DESTINATION}
DEPENDS ${tf_python_op_lib_name}_gen_python
)
-
+
set(tf_python_ops_generated_files ${tf_python_ops_generated_files}
${GENERATE_PYTHON_OP_LIB_DESTINATION} PARENT_SCOPE)
endfunction()
@@ -441,12 +473,14 @@ GENERATE_PYTHON_OP_LIB("image_ops")
GENERATE_PYTHON_OP_LIB("io_ops")
GENERATE_PYTHON_OP_LIB("linalg_ops")
GENERATE_PYTHON_OP_LIB("logging_ops")
+GENERATE_PYTHON_OP_LIB("losses")
GENERATE_PYTHON_OP_LIB("nn_ops")
GENERATE_PYTHON_OP_LIB("parsing_ops")
GENERATE_PYTHON_OP_LIB("random_ops")
GENERATE_PYTHON_OP_LIB("resource_variable_ops")
GENERATE_PYTHON_OP_LIB("script_ops")
GENERATE_PYTHON_OP_LIB("sdca_ops")
+GENERATE_PYTHON_OP_LIB("set_ops")
GENERATE_PYTHON_OP_LIB("state_ops")
GENERATE_PYTHON_OP_LIB("sparse_ops")
GENERATE_PYTHON_OP_LIB("string_ops")
@@ -473,10 +507,13 @@ add_dependencies(tf_python_ops tf_python_op_gen_main)
find_package(SWIG REQUIRED)
# Generate the C++ and Python source code for the SWIG wrapper.
+# NOTE(mrry): We always regenerate the SWIG wrapper, which means that we must
+# always re-link the Python extension, but we don't have to track the
+# individual headers on which the SWIG wrapper depends.
add_custom_command(
OUTPUT "${CMAKE_CURRENT_BINARY_DIR}/tf_python/tensorflow/python/pywrap_tensorflow.py"
"${CMAKE_CURRENT_BINARY_DIR}/pywrap_tensorflow.cc"
- DEPENDS tf_python_touchup_modules
+ DEPENDS tf_python_touchup_modules __force_rebuild
COMMAND ${SWIG_EXECUTABLE}
ARGS -python -c++
-I${tensorflow_source_dir}
@@ -535,7 +572,7 @@ target_link_libraries(pywrap_tensorflow
${tf_core_gpu_kernels_lib}
${tensorflow_EXTERNAL_LIBRARIES}
tf_protos_cc
- tf_python_protos_cc
+ tf_python_protos_cc
${PYTHON_LIBRARIES}
)
@@ -545,11 +582,12 @@ target_link_libraries(pywrap_tensorflow
add_custom_target(tf_python_build_pip_package)
add_dependencies(tf_python_build_pip_package
pywrap_tensorflow
+ tensorboard_copy_dependencies
tf_python_copy_scripts_to_destination
tf_python_touchup_modules
tf_python_ops)
add_custom_command(TARGET tf_python_build_pip_package POST_BUILD
- COMMAND ${CMAKE_COMMAND} -E copy ${tensorflow_source_dir}/tensorflow/contrib/cmake/setup.py
+ COMMAND ${CMAKE_COMMAND} -E copy ${tensorflow_source_dir}/tensorflow/tools/pip_package/setup.py
${CMAKE_CURRENT_BINARY_DIR}/tf_python/)
if(WIN32)
add_custom_command(TARGET tf_python_build_pip_package POST_BUILD
@@ -566,6 +604,33 @@ add_custom_command(TARGET tf_python_build_pip_package POST_BUILD
add_custom_command(TARGET tf_python_build_pip_package POST_BUILD
COMMAND ${CMAKE_COMMAND} -E copy ${tensorflow_source_dir}/tensorflow/tools/pip_package/MANIFEST.in
${CMAKE_CURRENT_BINARY_DIR}/tf_python/)
+
+# Copy resources for TensorBoard.
+add_custom_command(TARGET tf_python_build_pip_package POST_BUILD
+ COMMAND ${CMAKE_COMMAND} -E copy ${tensorflow_source_dir}/tensorflow/tensorboard/dist/bazel-html-imports.html
+ ${CMAKE_CURRENT_BINARY_DIR}/tf_python/tensorflow/tensorboard/dist/)
+add_custom_command(TARGET tf_python_build_pip_package POST_BUILD
+ COMMAND ${CMAKE_COMMAND} -E copy ${tensorflow_source_dir}/tensorflow/tensorboard/dist/index.html
+ ${CMAKE_CURRENT_BINARY_DIR}/tf_python/tensorflow/tensorboard/dist/)
+add_custom_command(TARGET tf_python_build_pip_package POST_BUILD
+ COMMAND ${CMAKE_COMMAND} -E copy ${tensorflow_source_dir}/tensorflow/tensorboard/dist/tf-tensorboard.html
+ ${CMAKE_CURRENT_BINARY_DIR}/tf_python/tensorflow/tensorboard/dist/)
+add_custom_command(TARGET tf_python_build_pip_package POST_BUILD
+ COMMAND ${CMAKE_COMMAND} -E copy ${tensorflow_source_dir}/tensorflow/tensorboard/lib/css/global.css
+ ${CMAKE_CURRENT_BINARY_DIR}/tf_python/tensorflow/tensorboard/lib/css/)
add_custom_command(TARGET tf_python_build_pip_package POST_BUILD
- COMMAND ${PYTHON_EXECUTABLE} ${CMAKE_CURRENT_BINARY_DIR}/tf_python/setup.py bdist_wheel
- WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}/tf_python)
+ COMMAND ${CMAKE_COMMAND} -E copy ${tensorflow_source_dir}/tensorflow/tensorboard/TAG
+ ${CMAKE_CURRENT_BINARY_DIR}/tf_python/tensorflow/tensorboard/)
+add_custom_command(TARGET tf_python_build_pip_package POST_BUILD
+ COMMAND ${CMAKE_COMMAND} -E copy_directory ${CMAKE_CURRENT_BINARY_DIR}/tensorboard_external
+ ${CMAKE_CURRENT_BINARY_DIR}/tf_python/external)
+
+if(${tensorflow_ENABLE_GPU})
+ add_custom_command(TARGET tf_python_build_pip_package POST_BUILD
+ COMMAND ${PYTHON_EXECUTABLE} ${CMAKE_CURRENT_BINARY_DIR}/tf_python/setup.py bdist_wheel --project_name tensorflow_gpu
+ WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}/tf_python)
+else()
+ add_custom_command(TARGET tf_python_build_pip_package POST_BUILD
+ COMMAND ${PYTHON_EXECUTABLE} ${CMAKE_CURRENT_BINARY_DIR}/tf_python/setup.py bdist_wheel
+ WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}/tf_python)
+endif(${tensorflow_ENABLE_GPU})
diff --git a/tensorflow/contrib/cmake/tf_tests.cmake b/tensorflow/contrib/cmake/tf_tests.cmake
index cca3e6c398..8608d3ff8f 100644
--- a/tensorflow/contrib/cmake/tf_tests.cmake
+++ b/tensorflow/contrib/cmake/tf_tests.cmake
@@ -70,17 +70,18 @@ function(AddTest)
)
foreach(datafile ${_AT_DATA})
+ file(RELATIVE_PATH datafile_rel ${tensorflow_source_dir} ${datafile})
add_custom_command(
TARGET ${_AT_TARGET} POST_BUILD
COMMAND ${CMAKE_COMMAND} -E copy
- "${CMAKE_CURRENT_SOURCE_DIR}/${datafile}"
- "${testdir}/${datafile}"
- DEPENDS "${CMAKE_CURRENT_SOURCE_DIR}/${datafile}"
+ "${datafile}"
+ "${testdir}/${datafile_rel}"
+ DEPENDS "${datafile}"
)
endforeach()
if (_AT_DEPENDS)
- add_dependencies(${_AT_TARGET} ${_AT_DEPENDS})
+ add_dependencies(${_AT_TARGET} ${_AT_DEPENDS} googletest)
endif()
endfunction(AddTest)
@@ -98,7 +99,7 @@ function(AddPythonTests)
endif(_AT_DEPENDS)
foreach(sourcefile ${_AT_SOURCES})
- add_test(NAME ${sourcefile} COMMAND ${PYTHON_EXECUTABLE} ${sourcefile})
+ add_test(NAME ${sourcefile} COMMAND ${PYTHON_EXECUTABLE} ${sourcefile} WORKING_DIRECTORY ${tensorflow_source_dir})
if (_AT_DEPENDS)
add_dependencies(${_AT_TARGET} ${_AT_DEPENDS})
endif()
@@ -116,6 +117,9 @@ if (tensorflow_BUILD_PYTHON_TESTS)
# include all test
file(GLOB_RECURSE tf_test_src_py
"${tensorflow_source_dir}/tensorflow/python/kernel_tests/*.py"
+ "${tensorflow_source_dir}/tensorflow/python/saved_model/*_test.py"
+ "${tensorflow_source_dir}/tensorflow/python/training/*_test.py"
+ "${tensorflow_source_dir}/tensorflow/tensorboard/*_test.py"
)
# exclude the onces we don't want
@@ -143,22 +147,21 @@ if (tensorflow_BUILD_PYTHON_TESTS)
"${tensorflow_source_dir}/tensorflow/python/kernel_tests/cwise_ops_test.py"
"${tensorflow_source_dir}/tensorflow/python/kernel_tests/variable_scope_test.py"
"${tensorflow_source_dir}/tensorflow/python/kernel_tests/reshape_op_test.py"
+ "${tensorflow_source_dir}/tensorflow/tensorboard/backend/server_test.py"
# int32/int64 mixup
"${tensorflow_source_dir}/tensorflow/python/kernel_tests/functional_ops_test.py"
"${tensorflow_source_dir}/tensorflow/python/kernel_tests/py_func_test.py"
- # issues related to windows fs
- "${tensorflow_source_dir}/tensorflow/python/kernel_tests/io_ops_test.py"
- # missing kernel
- "${tensorflow_source_dir}/tensorflow/python/kernel_tests/conv_ops_test.py"
- "${tensorflow_source_dir}/tensorflow/python/kernel_tests/depthwise_conv_op_test.py"
- "${tensorflow_source_dir}/tensorflow/python/kernel_tests/fractional_avg_pool_op_test.py"
- "${tensorflow_source_dir}/tensorflow/python/kernel_tests/pool_test.py"
- "${tensorflow_source_dir}/tensorflow/python/kernel_tests/qr_op_test.py"
- "${tensorflow_source_dir}/tensorflow/python/kernel_tests/svd_op_test.py"
# cuda launch failed
"${tensorflow_source_dir}/tensorflow/python/kernel_tests/diag_op_test.py"
"${tensorflow_source_dir}/tensorflow/python/kernel_tests/trace_op_test.py"
"${tensorflow_source_dir}/tensorflow/python/kernel_tests/one_hot_op_test.py" # gpu, T=uint8
+ # training tests
+ "${tensorflow_source_dir}/tensorflow/python/training/basic_session_run_hooks_test.py" # Needs tf.contrib fix.
+ "${tensorflow_source_dir}/tensorflow/python/training/localhost_cluster_performance_test.py" # Needs portpicker.
+ "${tensorflow_source_dir}/tensorflow/python/training/monitored_session_test.py" # Needs tf.contrib fix.
+ "${tensorflow_source_dir}/tensorflow/python/training/saver_large_variable_test.py" # Overflow error.
+ "${tensorflow_source_dir}/tensorflow/python/training/supervisor_test.py" # Flaky I/O error on rename.
+ "${tensorflow_source_dir}/tensorflow/python/training/sync_replicas_optimizer_test.py" # Needs portpicker.
)
endif()
list(REMOVE_ITEM tf_test_src_py ${tf_test_src_py_exclude})
@@ -172,7 +175,7 @@ if (tensorflow_BUILD_CC_TESTS)
#
# cc unit tests. Be aware that by default we include 250+ tests which
# will take time and space to build.
- # If you wan to cut this down, for example to a specific test, modify
+ # If you want to cut this down, for example to a specific test, modify
# tf_test_src_simple to your needs
#
@@ -195,6 +198,18 @@ if (tensorflow_BUILD_CC_TESTS)
"${tensorflow_source_dir}/tensorflow/c/tf_status_helper.cc"
)
+ if(WIN32)
+ set(tf_src_testlib
+ ${tf_src_testlib}
+ "${tensorflow_source_dir}/tensorflow/core/platform/windows/test.cc"
+ )
+ else()
+ set(tf_src_testlib
+ ${tf_src_testlib}
+ "${tensorflow_source_dir}/tensorflow/core/platform/posix/test.cc"
+ )
+ endif()
+
# include all test
file(GLOB_RECURSE tf_test_src_simple
"${tensorflow_source_dir}/tensorflow/cc/*_test.cc"
@@ -204,6 +219,15 @@ if (tensorflow_BUILD_CC_TESTS)
"${tensorflow_source_dir}/tensorflow/contrib/rnn/*_test.cc"
)
+ # exclude the ones we don't want
+ set(tf_test_src_simple_exclude
+ # generally not working
+ "${tensorflow_source_dir}/tensorflow/cc/client/client_session_test.cc"
+ "${tensorflow_source_dir}/tensorflow/cc/framework/gradients_test.cc"
+ "${tensorflow_source_dir}/tensorflow/core/distributed_runtime/call_options_test.cc"
+ "${tensorflow_source_dir}/tensorflow/core/distributed_runtime/tensor_coding_test.cc"
+ )
+
if (NOT tensorflow_ENABLE_GPU)
# exclude gpu tests if we are not buildig for gpu
set(tf_test_src_simple_exclude
@@ -217,15 +241,6 @@ if (tensorflow_BUILD_CC_TESTS)
)
endif()
- # exclude the onces we don't want
- set(tf_test_src_simple_exclude
- # generally not working
- "${tensorflow_source_dir}/tensorflow/cc/client/client_session_test.cc"
- "${tensorflow_source_dir}/tensorflow/cc/framework/gradients_test.cc"
- "${tensorflow_source_dir}/tensorflow/core/distributed_runtime/call_options_test.cc"
- "${tensorflow_source_dir}/tensorflow/core/distributed_runtime/tensor_coding_test.cc"
- )
-
if (WIN32)
set(tf_test_src_simple_exclude
${tf_test_src_simple_exclude}
@@ -235,68 +250,26 @@ if (tensorflow_BUILD_CC_TESTS)
# TODO: test failing
"${tensorflow_source_dir}/tensorflow/core/common_runtime/simple_placer_test.cc"
+ "${tensorflow_source_dir}/tensorflow/core/debug/debug_gateway_test.cc" # hangs
"${tensorflow_source_dir}/tensorflow/core/distributed_runtime/executor_test.cc"
"${tensorflow_source_dir}/tensorflow/core/kernels/quantized_reshape_op_test.cc"
"${tensorflow_source_dir}/tensorflow/core/kernels/requantization_range_op_test.cc"
"${tensorflow_source_dir}/tensorflow/core/kernels/requantize_op_test.cc"
- "${tensorflow_source_dir}/tensorflow/core/kernels/restore_op_test.cc"
- "${tensorflow_source_dir}/tensorflow/core/kernels/restore_v2_op_test.cc"
- "${tensorflow_source_dir}/tensorflow/core/kernels/save_op_test.cc"
- "${tensorflow_source_dir}/tensorflow/core/kernels/sparse_reduce_sum_op_test.cc"
- "${tensorflow_source_dir}/tensorflow/core/kernels/restore_op_test.cc"
"${tensorflow_source_dir}/tensorflow/core/kernels/quantize_op_test.cc"
- "${tensorflow_source_dir}/tensorflow/core/lib/core/status_test.cc"
"${tensorflow_source_dir}/tensorflow/core/lib/strings/str_util_test.cc"
"${tensorflow_source_dir}/tensorflow/core/lib/strings/numbers_test.cc"
"${tensorflow_source_dir}/tensorflow/core/lib/monitoring/collection_registry_test.cc"
- "${tensorflow_source_dir}/tensorflow/core/util/tensor_slice_reader_test.cc"
"${tensorflow_source_dir}/tensorflow/core/platform/file_system_test.cc"
- "${tensorflow_source_dir}/tensorflow/core/platform/logging_test.cc"
- "${tensorflow_source_dir}/tensorflow/core/platform/env_test.cc"
- "${tensorflow_source_dir}/tensorflow/core/ops/math_grad_test.cc"
"${tensorflow_source_dir}/tensorflow/contrib/cudnn_rnn/cudnn_rnn_ops_test.cc"
"${tensorflow_source_dir}/tensorflow/contrib/rnn/ops/gru_ops_test.cc" # status 5
"${tensorflow_source_dir}/tensorflow/contrib/rnn/ops/lstm_ops_test.cc" # status 5
- # TODO: not compiling
- "${tensorflow_source_dir}/tensorflow/cc/framework/gradient_checker_test.cc"
- "${tensorflow_source_dir}/tensorflow/cc/gradients/math_grad_test.cc"
- "${tensorflow_source_dir}/tensorflow/cc/gradients/array_grad_test.cc"
- "${tensorflow_source_dir}/tensorflow/cc/saved_model/loader_test.cc"
- "${tensorflow_source_dir}/tensorflow/cc/training/queue_runner_test.cc"
- "${tensorflow_source_dir}/tensorflow/cc/training/coordinator_test.cc"
- "${tensorflow_source_dir}/tensorflow/core/kernels/nn_ops_test.cc"
+ # TODO: not compiling
"${tensorflow_source_dir}/tensorflow/core/kernels/quantization_utils_test.cc"
- "${tensorflow_source_dir}/tensorflow/core/kernels/activation_ops_test.cc"
- "${tensorflow_source_dir}/tensorflow/core/kernels/batch_norm_op_test.cc"
- "${tensorflow_source_dir}/tensorflow/core/kernels/bias_add_op_test.cc"
- "${tensorflow_source_dir}/tensorflow/core/kernels/concat_op_test.cc"
- "${tensorflow_source_dir}/tensorflow/core/kernels/conv_ops_test.cc"
- "${tensorflow_source_dir}/tensorflow/core/kernels/matmul_op_test.cc"
- "${tensorflow_source_dir}/tensorflow/core/kernels/pooling_ops_test.cc"
"${tensorflow_source_dir}/tensorflow/core/kernels/quantize_and_dequantize_op_test.cc"
"${tensorflow_source_dir}/tensorflow/core/kernels/quantize_down_and_shrink_range_op_test.cc"
- "${tensorflow_source_dir}/tensorflow/core/kernels/parameterized_truncated_normal_op_test.cc"
- "${tensorflow_source_dir}/tensorflow/core/kernels/non_max_suppression_op_test.cc"
- "${tensorflow_source_dir}/tensorflow/core/kernels/fused_batch_norm_op_test.cc"
- "${tensorflow_source_dir}/tensorflow/core/kernels/eigen_backward_spatial_convolutions_test.cc"
"${tensorflow_source_dir}/tensorflow/core/kernels/hexagon/quantized_matmul_op_for_hexagon_test.cc"
- "${tensorflow_source_dir}/tensorflow/core/kernels/hexagon/hexagon_graph_transferer_test.cc"
- "${tensorflow_source_dir}/tensorflow/core/kernels/adjust_contrast_op_test.cc"
- "${tensorflow_source_dir}/tensorflow/core/kernels/batch_norm_op_test.cc"
- "${tensorflow_source_dir}/tensorflow/core/kernels/cast_op_test.cc"
- "${tensorflow_source_dir}/tensorflow/core/kernels/colorspace_op_test.cc"
- "${tensorflow_source_dir}/tensorflow/core/kernels/control_flow_ops_test.cc"
- "${tensorflow_source_dir}/tensorflow/core/kernels/conv_ops_test.cc"
"${tensorflow_source_dir}/tensorflow/core/kernels/debug_ops_test.cc"
- "${tensorflow_source_dir}/tensorflow/core/kernels/resize_bilinear_op_test.cc"
- "${tensorflow_source_dir}/tensorflow/core/kernels/resize_nearest_neighbor_op_benchmark_test.cc"
- "${tensorflow_source_dir}/tensorflow/core/kernels/spacetobatch_benchmark_test.cc"
- "${tensorflow_source_dir}/tensorflow/core/kernels/sparse_add_op_test.cc"
- "${tensorflow_source_dir}/tensorflow/core/kernels/sparse_dense_binary_op_shared_test.cc"
- "${tensorflow_source_dir}/tensorflow/core/kernels/sparse_tensor_dense_matmul_op_test.cc"
- "${tensorflow_source_dir}/tensorflow/core/kernels/summary_image_op_test.cc"
- "${tensorflow_source_dir}/tensorflow/core/kernels/summary_op_test.cc"
"${tensorflow_source_dir}/tensorflow/core/kernels/quantized_activation_ops_test.cc"
"${tensorflow_source_dir}/tensorflow/core/kernels/quantized_bias_add_op_test.cc"
"${tensorflow_source_dir}/tensorflow/core/kernels/quantized_concat_op_test.cc"
@@ -312,10 +285,8 @@ if (tensorflow_BUILD_CC_TESTS)
"${tensorflow_source_dir}/tensorflow/core/platform/cloud/retrying_file_system_test.cc"
"${tensorflow_source_dir}/tensorflow/core/platform/cloud/time_util_test.cc"
"${tensorflow_source_dir}/tensorflow/core/platform/hadoop/hadoop_file_system_test.cc"
- "${tensorflow_source_dir}/tensorflow/core/platform/port_test.cc"
"${tensorflow_source_dir}/tensorflow/core/platform/profile_utils/cpu_utils_test.cc"
- "${tensorflow_source_dir}/tensorflow/core/common_runtime/direct_session_test.cc"
- "${tensorflow_source_dir}/tensorflow/core/common_runtime/direct_session_with_tracking_alloc_test.cc"
+ "${tensorflow_source_dir}/tensorflow/core/platform/subprocess_test.cc"
"${tensorflow_source_dir}/tensorflow/core/common_runtime/gpu/gpu_allocator_retry_test.cc"
"${tensorflow_source_dir}/tensorflow/core/common_runtime/gpu/gpu_debug_allocator_test.cc"
"${tensorflow_source_dir}/tensorflow/core/distributed_runtime/master_test.cc"
@@ -325,20 +296,9 @@ if (tensorflow_BUILD_CC_TESTS)
"${tensorflow_source_dir}/tensorflow/core/distributed_runtime/rpc/grpc_tensor_coding_test.cc"
"${tensorflow_source_dir}/tensorflow/core/distributed_runtime/rpc/rpc_rendezvous_mgr_test.cc"
"${tensorflow_source_dir}/tensorflow/core/distributed_runtime/master_test.cc"
- "${tensorflow_source_dir}/tensorflow/core/framework/partial_tensor_shape_test.cc"
- "${tensorflow_source_dir}/tensorflow/core/lib/core/notification_test.cc"
- "${tensorflow_source_dir}/tensorflow/core/lib/gtl/cleanup_test.cc"
- "${tensorflow_source_dir}/tensorflow/core/lib/gtl/edit_distance_test.cc"
- "${tensorflow_source_dir}/tensorflow/core/lib/strings/strcat_test.cc"
- "${tensorflow_source_dir}/tensorflow/core/ops/array_grad_test.cc"
- "${tensorflow_source_dir}/tensorflow/core/ops/nn_ops_test.cc"
"${tensorflow_source_dir}/tensorflow/core/example/example_parser_configuration_test.cc"
"${tensorflow_source_dir}/tensorflow/core/example/feature_util_test.cc"
"${tensorflow_source_dir}/tensorflow/core/util/reporter_test.cc"
- "${tensorflow_source_dir}/tensorflow/core/util/memmapped_file_system_test.cc"
- "${tensorflow_source_dir}/tensorflow/core/util/sparse_sparse_tensor_test.cc"
- "${tensorflow_source_dir}/tensorflow/core/debug/debug_gateway_test.cc"
- "${tensorflow_source_dir}/tensorflow/core/debug/debug_io_utils_test.cc"
"${tensorflow_source_dir}/tensorflow/contrib/factorization/kernels/clustering_ops_test.cc"
"${tensorflow_source_dir}/tensorflow/contrib/session_bundle/bundle_shim_test.cc"
"${tensorflow_source_dir}/tensorflow/contrib/session_bundle/bundle_test.cc"
@@ -348,7 +308,15 @@ if (tensorflow_BUILD_CC_TESTS)
)
endif()
- list(REMOVE_ITEM tf_test_src_simple ${tf_test_src_simple_exclude})
+  # Tests for saved_model require data, so we need to treat them separately.
+ file(GLOB tf_cc_saved_model_test_srcs
+ "${tensorflow_source_dir}/tensorflow/cc/saved_model/*_test.cc"
+ )
+
+ list(REMOVE_ITEM tf_test_src_simple
+ ${tf_test_src_simple_exclude}
+ ${tf_cc_saved_model_test_srcs}
+ )
set(tf_test_lib tf_test_lib)
add_library(${tf_test_lib} STATIC ${tf_src_testlib})
@@ -360,6 +328,7 @@ if (tensorflow_BUILD_CC_TESTS)
$<TARGET_OBJECTS:tf_core_cpu>
$<TARGET_OBJECTS:tf_core_framework>
$<TARGET_OBJECTS:tf_core_kernels>
+ $<TARGET_OBJECTS:tf_cc>
$<TARGET_OBJECTS:tf_cc_framework>
$<TARGET_OBJECTS:tf_cc_ops>
$<TARGET_OBJECTS:tf_core_ops>
@@ -375,10 +344,23 @@ if (tensorflow_BUILD_CC_TESTS)
${tensorflow_EXTERNAL_LIBRARIES}
)
+ # All tests that require no data.
AddTests(
SOURCES ${tf_test_src_simple}
OBJECTS ${tf_obj_test}
LIBS ${tf_test_libs}
- DEPENDS googletest
)
+
+ # Tests for tensorflow/cc/saved_model.
+ file(GLOB_RECURSE tf_cc_saved_model_test_data
+ "${tensorflow_source_dir}/tensorflow/cc/saved_model/testdata/*"
+ )
+
+ AddTests(
+ SOURCES ${tf_cc_saved_model_test_srcs}
+ DATA ${tf_cc_saved_model_test_data}
+ OBJECTS ${tf_obj_test}
+ LIBS ${tf_test_libs}
+ )
+
endif(tensorflow_BUILD_CC_TESTS)
diff --git a/tensorflow/contrib/framework/python/ops/prettyprint_ops_test.py b/tensorflow/contrib/framework/python/ops/prettyprint_ops_test.py
index 5db9cef58b..dfaaafd88e 100644
--- a/tensorflow/contrib/framework/python/ops/prettyprint_ops_test.py
+++ b/tensorflow/contrib/framework/python/ops/prettyprint_ops_test.py
@@ -52,7 +52,7 @@ class PrettyPrintOpsTest(tf.test.TestCase):
a = tf.Variable(1.0)
a = tf.contrib.framework.print_op(a)
with self.test_session():
- tf.initialize_all_variables().run()
+ tf.global_variables_initializer().run()
a.eval()
if __name__ == "__main__":
diff --git a/tensorflow/contrib/graph_editor/tests/transform_test.py b/tensorflow/contrib/graph_editor/tests/transform_test.py
index 6b26869236..9a06431320 100644
--- a/tensorflow/contrib/graph_editor/tests/transform_test.py
+++ b/tensorflow/contrib/graph_editor/tests/transform_test.py
@@ -58,6 +58,20 @@ class TransformTest(tf.test.TestCase):
self.assertEqual(t.name, t_.name)
self.assertEqual(info.original(t_), t)
+ def test_copy_assert(self):
+ tf.reset_default_graph()
+ a = tf.constant(1)
+ b = tf.constant(1)
+ eq = tf.equal(a, b)
+ assert_op = tf.Assert(eq, [a, b])
+ with tf.control_dependencies([assert_op]):
+ _ = tf.add(a, b)
+ sgv = ge.make_view([assert_op, eq.op, a.op, b.op])
+ copier = ge.Transformer()
+ copied_sgv, info = copier(sgv, sgv.graph, "", "")
+ new_assert_op = info.transformed(assert_op)
+ self.assertIsNotNone(new_assert_op)
+
def test_transform(self):
transformer = ge.Transformer()
def my_transform_op_handler(info, op):
diff --git a/tensorflow/contrib/graph_editor/transform.py b/tensorflow/contrib/graph_editor/transform.py
index 26047437d7..11c19ccc22 100644
--- a/tensorflow/contrib/graph_editor/transform.py
+++ b/tensorflow/contrib/graph_editor/transform.py
@@ -446,10 +446,7 @@ class Transformer(object):
# without any outputs. So the walk is now finalized from those roots.
remaining_ops = [op for op in self._info.sgv.ops
if op not in self._info.transformed_ops]
- remaining_roots = [
- op for op in remaining_ops
- if not op.outputs and not self._info.control_outputs.get(op)
- ]
+ remaining_roots = [op for op in remaining_ops if not op.outputs]
for op in remaining_roots:
self._transform_op(op)
diff --git a/tensorflow/contrib/layers/python/layers/feature_column_ops_test.py b/tensorflow/contrib/layers/python/layers/feature_column_ops_test.py
index 032ea57cf5..7f95e95c1b 100644
--- a/tensorflow/contrib/layers/python/layers/feature_column_ops_test.py
+++ b/tensorflow/contrib/layers/python/layers/feature_column_ops_test.py
@@ -1209,7 +1209,7 @@ class WeightedSumTest(tf.test.TestCase):
logits, _, _ = tf.contrib.layers.weighted_sum_from_feature_columns(
features, [hashed_sparse], num_outputs=5)
with self.test_session():
- tf.initialize_all_variables().run()
+ tf.global_variables_initializer().run()
self.assertAllEqual(logits.eval().shape, [2, 5])
def testWeightedSparseColumn(self):
@@ -1244,7 +1244,7 @@ class WeightedSumTest(tf.test.TestCase):
features, [weighted_ids], num_outputs=5)
with self.test_session():
- tf.initialize_all_variables().run()
+ tf.global_variables_initializer().run()
tf.initialize_all_tables().run()
self.assertAllEqual(logits.eval().shape, [2, 5])
@@ -1846,7 +1846,7 @@ class WeightedSumTest(tf.test.TestCase):
[product],
num_outputs=1))
with self.test_session() as sess:
- tf.initialize_all_variables().run()
+ tf.global_variables_initializer().run()
tf.initialize_all_tables().run()
product_weights = column_to_variable[product][0]
sess.run(product_weights.assign([[0.1], [0.2], [0.3], [0.4], [0.5]]))
@@ -1862,7 +1862,7 @@ class WeightedSumTest(tf.test.TestCase):
[product],
num_outputs=1))
with self.test_session() as sess:
- tf.initialize_all_variables().run()
+ tf.global_variables_initializer().run()
tf.initialize_all_tables().run()
product_weights = column_to_variable[product][0]
sess.run(product_weights.assign([[0.1], [0.2], [0.3], [0.4], [0.5]]))
diff --git a/tensorflow/contrib/layers/python/layers/layers.py b/tensorflow/contrib/layers/python/layers/layers.py
index e13280e7df..cf0df3f095 100644
--- a/tensorflow/contrib/layers/python/layers/layers.py
+++ b/tensorflow/contrib/layers/python/layers/layers.py
@@ -171,7 +171,10 @@ def _fused_batch_norm(
`batch_size`. The normalization is over all but the last dimension if
`data_format` is `NHWC` and the second dimension if `data_format` is
`NCHW`.
- decay: decay for the moving average.
+ decay: decay for the moving average. Reasonable values for `decay` are close
+ to 1.0, typically in the multiple-nines range: 0.999, 0.99, 0.9, etc. Lower
+ `decay` value (recommend trying `decay`=0.9) if model experiences reasonably
+ good training performance but poor validation and/or test performance.
center: If True, subtract `beta`. If False, `beta` is ignored.
scale: If True, multiply by `gamma`. If False, `gamma` is
not used. When the next layer is linear (also e.g. `nn.relu`), this can be
@@ -396,7 +399,10 @@ def batch_norm(
`batch_size`. The normalization is over all but the last dimension if
`data_format` is `NHWC` and the second dimension if `data_format` is
`NCHW`.
- decay: decay for the moving average.
+ decay: decay for the moving average. Reasonable values for `decay` are close
+ to 1.0, typically in the multiple-nines range: 0.999, 0.99, 0.9, etc. Lower
+ `decay` value (recommend trying `decay`=0.9) if model experiences reasonably
+ good training performance but poor validation and/or test performance.
center: If True, subtract `beta`. If False, `beta` is ignored.
scale: If True, multiply by `gamma`. If False, `gamma` is
not used. When the next layer is linear (also e.g. `nn.relu`), this can be
@@ -1369,7 +1375,7 @@ def fully_connected(inputs,
Raises:
ValueError: if x has rank less than 2 or if its last dimension is not set.
"""
- if not (isinstance(num_outputs, int) or isinstance(num_outputs, long)):
+ if not (isinstance(num_outputs, six.integer_types)):
raise ValueError('num_outputs should be int or long, got %s.', num_outputs)
layer_variable_getter = _build_variable_getter({'bias': 'biases'})
diff --git a/tensorflow/contrib/layers/python/layers/optimizers_test.py b/tensorflow/contrib/layers/python/layers/optimizers_test.py
index 1dfed82103..ab183ba75d 100644
--- a/tensorflow/contrib/layers/python/layers/optimizers_test.py
+++ b/tensorflow/contrib/layers/python/layers/optimizers_test.py
@@ -195,7 +195,7 @@ class OptimizersTest(tf.test.TestCase):
self.assertAlmostEqual(var_value, 9.8916, 4)
self.assertEqual(global_step_value, 1)
var_count = 0
- for var in tf.all_variables():
+ for var in tf.global_variables():
if var.name.startswith("OptimizeLoss/AdaptiveMaxNorm"):
var_count += 1
self.assertEqual(2, var_count)
@@ -366,7 +366,7 @@ class AdaptiveClipping(tf.test.TestCase):
decay=0.5)(grads_and_vars)
var_dict = {}
- for var in tf.all_variables():
+ for var in tf.global_variables():
if var.name.startswith("AdaptiveMaxNorm"):
var_dict[var.name.split(":")[0]] = var
self.assertEqual(2, len(var_dict))
diff --git a/tensorflow/contrib/learn/python/learn/datasets/base.py b/tensorflow/contrib/learn/python/learn/datasets/base.py
index cdff6baf83..71978d4394 100644
--- a/tensorflow/contrib/learn/python/learn/datasets/base.py
+++ b/tensorflow/contrib/learn/python/learn/datasets/base.py
@@ -186,8 +186,8 @@ def _is_retriable(e):
@retry(initial_delay=1.0, max_delay=16.0, is_retriable=_is_retriable)
-def urlretrieve_with_retry(url, filename):
- urllib.request.urlretrieve(url, filename)
+def urlretrieve_with_retry(url, filename=None):
+ return urllib.request.urlretrieve(url, filename)
def maybe_download(filename, work_directory, source_url):
@@ -205,11 +205,9 @@ def maybe_download(filename, work_directory, source_url):
gfile.MakeDirs(work_directory)
filepath = os.path.join(work_directory, filename)
if not gfile.Exists(filepath):
- with tempfile.NamedTemporaryFile() as tmpfile:
- temp_file_name = tmpfile.name
- urlretrieve_with_retry(source_url, temp_file_name)
- gfile.Copy(temp_file_name, filepath)
- with gfile.GFile(filepath) as f:
- size = f.size()
- print('Successfully downloaded', filename, size, 'bytes.')
+ temp_file_name, _ = urlretrieve_with_retry(source_url)
+ gfile.Copy(temp_file_name, filepath)
+ with gfile.GFile(filepath) as f:
+ size = f.size()
+ print('Successfully downloaded', filename, size, 'bytes.')
return filepath
diff --git a/tensorflow/contrib/learn/python/learn/estimators/estimator.py b/tensorflow/contrib/learn/python/learn/estimators/estimator.py
index 91d900395b..2ec5a0659a 100644
--- a/tensorflow/contrib/learn/python/learn/estimators/estimator.py
+++ b/tensorflow/contrib/learn/python/learn/estimators/estimator.py
@@ -330,8 +330,8 @@ class BaseEstimator(
# Features and labels TensorSignature objects.
# TODO(wicke): Rename these to something more descriptive
- self._features_info = None
- self._labels_info = None
+ self._features_info = {}
+ self._labels_info = {}
self._graph = None
@@ -641,28 +641,29 @@ class BaseEstimator(
return tensor_signature.create_example_parser_from_signatures(
self._features_info, examples_batch)
- def _check_inputs(self, features, labels):
- if self._features_info is not None:
- logging.debug('Given features: %s, required signatures: %s.',
- str(features), str(self._features_info))
- if not tensor_signature.tensors_compatible(features, self._features_info):
- raise ValueError('Features are incompatible with given information. '
+ def _check_inputs(self, features, labels, mode):
+ if mode in self._features_info:
+ logging.debug('Given features for mode %s: %s, required signatures: %s.',
+ mode, str(features), str(self._features_info[mode]))
+
+ if not tensor_signature.tensors_compatible(features, self._features_info[mode]):
+ raise ValueError('Features for mode %s are incompatible with given information. '
'Given features: %s, required signatures: %s.' %
- (str(features), str(self._features_info)))
+ (mode, str(features), str(self._features_info[mode])))
else:
- self._features_info = tensor_signature.create_signatures(features)
- logging.debug('Setting feature info to %s.', str(self._features_info))
+ self._features_info[mode] = tensor_signature.create_signatures(features)
+ logging.debug('Setting feature info for mode %s to %s.', mode, str(self._features_info[mode]))
if labels is not None:
- if self._labels_info is not None:
+ if mode in self._labels_info:
logging.debug('Given labels: %s, required signatures: %s.',
str(labels), str(self._labels_info))
- if not tensor_signature.tensors_compatible(labels, self._labels_info):
- raise ValueError('Labels are incompatible with given information. '
+ if not tensor_signature.tensors_compatible(labels, self._labels_info[mode]):
+ raise ValueError('Labels for mode %s are incompatible with given information. '
'Given labels: %s, required signatures: %s.' %
- (str(labels), str(self._labels_info)))
+ (mode, str(labels), str(self._labels_info[mode])))
else:
- self._labels_info = tensor_signature.create_signatures(labels)
- logging.debug('Setting labels info to %s', str(self._labels_info))
+ self._labels_info[mode] = tensor_signature.create_signatures(labels)
+ logging.debug('Setting labels info for mode %s to %s', mode, str(self._labels_info[mode]))
def _train_model(self,
input_fn,
@@ -699,8 +700,7 @@ class BaseEstimator(
random_seed.set_random_seed(self._config.tf_random_seed)
global_step = contrib_framework.create_global_step(g)
features, labels = input_fn()
- self._check_inputs(features, labels)
-
+ self._check_inputs(features, labels, model_fn_lib.ModeKeys.TRAIN)
# The default return type of _get_train_ops is ModelFnOps. But there are
# some subclasses of tf.contrib.learn.Estimator which override this
# method and use the legacy signature, namely _get_train_ops returns a
@@ -800,8 +800,7 @@ class BaseEstimator(
random_seed.set_random_seed(self._config.tf_random_seed)
global_step = contrib_framework.create_global_step(g)
features, labels = input_fn()
- self._check_inputs(features, labels)
-
+ self._check_inputs(features, labels, model_fn_lib.ModeKeys.EVAL)
# The default return type of _get_eval_ops is ModelFnOps. But there are
# some subclasses of tf.contrib.learn.Estimator which override this
# method and use the legacy signature, namely _get_eval_ops returns an
@@ -835,6 +834,29 @@ class BaseEstimator(
return result[0]
return result
+ def _set_infer_mode_feature_signature(self, features):
+ for mode in list(self._features_info.keys()):
+ if tensor_signature.tensors_compatible(features, self._features_info[mode]):
+ self._features_info[model_fn_lib.ModeKeys.INFER] = self._features_info[mode]
+ if mode in self._labels_info:
+ self._labels_info[model_fn_lib.ModeKeys.INFER] = (
+ self._labels_info[mode])
+ else:
+ self._labels_info[model_fn_lib.ModeKeys.INFER] = None
+ break
+
+ if model_fn_lib.ModeKeys.INFER not in self._features_info:
+ logging.warning('Features for mode %s are incompatible with neither train mode nor eval mode.'
+ ' Given features: %s' % (model_fn_lib.ModeKeys.INFER, str(features)))
+ for mode in list(self._features_info.keys()):
+ logging.warning('Whereas %s mode signatures: %s' % (mode, str(self._features_info[mode])))
+ self._check_inputs(features, None, model_fn_lib.ModeKeys.INFER)
+ if model_fn_lib.ModeKeys.TRAIN in self._labels_info:
+ logging.warning('Setting labels info for mode infer equal to that of labels info for train mode')
+ self._labels_info[model_fn_lib.ModeKeys.INFER] = self._labels_info[model_fn_lib.ModeKeys.TRAIN]
+ else:
+ self._labels_info[model_fn_lib.ModeKeys.INFER] = {}
+
def _infer_model(
self, input_fn, feed_fn=None, outputs=None, as_iterable=True):
# Check that model has been trained.
@@ -1134,8 +1156,10 @@ class Estimator(BaseEstimator):
Returns:
`ModelFnOps` object.
"""
+
+ self._set_infer_mode_feature_signature(features)
labels = tensor_signature.create_placeholders_from_signatures(
- self._labels_info)
+ self._labels_info[model_fn_lib.ModeKeys.INFER])
return self._call_model_fn(features, labels, model_fn_lib.ModeKeys.INFER)
@experimental
@@ -1239,7 +1263,7 @@ class Estimator(BaseEstimator):
return export_dir
-# For time of deprecation x,y from Estimator allow direct access.
+# For time of deprecation x,y from Estimator allow direct access
# pylint: disable=protected-access
class SKCompat(sklearn.BaseEstimator):
"""Scikit learn wrapper for TensorFlow Learn Estimator."""
diff --git a/tensorflow/contrib/learn/python/learn/estimators/estimator_test.py b/tensorflow/contrib/learn/python/learn/estimators/estimator_test.py
index 5ebc299b57..3405005327 100644
--- a/tensorflow/contrib/learn/python/learn/estimators/estimator_test.py
+++ b/tensorflow/contrib/learn/python/learn/estimators/estimator_test.py
@@ -91,7 +91,18 @@ def boston_eval_fn():
0)
+def extract(data, key):
+ if isinstance(data, dict):
+ assert key in data
+ return data[key]
+ else:
+ return data
+
+
def linear_model_params_fn(features, labels, mode, params):
+ features = extract(features, 'input')
+ labels = extract(labels, 'labels')
+
assert mode in (
tf.contrib.learn.ModeKeys.TRAIN,
tf.contrib.learn.ModeKeys.EVAL,
@@ -106,6 +117,8 @@ def linear_model_params_fn(features, labels, mode, params):
def linear_model_fn(features, labels, mode):
+ features = extract(features, 'input')
+ labels = extract(labels, 'labels')
assert mode in (
tf.contrib.learn.ModeKeys.TRAIN,
tf.contrib.learn.ModeKeys.EVAL,
@@ -140,8 +153,8 @@ def linear_model_fn_with_model_fn_ops(features, labels, mode):
def logistic_model_no_mode_fn(features, labels):
- if isinstance(labels, dict):
- labels = labels['labels']
+ features = extract(features, 'input')
+ labels = extract(labels, 'labels')
labels = tf.one_hot(labels, 3, 1, 0)
prediction, loss = (
tf.contrib.learn.models.logistic_regression_zero_init(features, labels)
@@ -346,6 +359,34 @@ class EstimatorTest(tf.test.TestCase):
with self.assertRaises(tf.contrib.learn.NotFittedError):
est.predict(x=boston.data)
+ def testContinueTrainingDictionaryInput(self):
+ boston = tf.contrib.learn.datasets.load_boston()
+ output_dir = tempfile.mkdtemp()
+ est = tf.contrib.learn.Estimator(model_fn=linear_model_fn,
+ model_dir=output_dir)
+ boston_input = {'input': boston.data}
+ float64_target = {'labels': boston.target.astype(np.float64)}
+ est.fit(x=boston_input, y=float64_target, steps=50)
+ scores = est.evaluate(
+ x=boston_input,
+ y=float64_target,
+ metrics={'MSE': tf.contrib.metrics.streaming_mean_squared_error})
+ del est
+ # Create another estimator object with the same output dir.
+ est2 = tf.contrib.learn.Estimator(model_fn=linear_model_fn,
+ model_dir=output_dir)
+
+ # Check we can evaluate and predict.
+ scores2 = est2.evaluate(
+ x=boston_input,
+ y=float64_target,
+ metrics={'MSE': tf.contrib.metrics.streaming_mean_squared_error})
+ self.assertAllClose(scores2['MSE'],
+ scores['MSE'])
+ predictions = np.array(list(est2.predict(x=boston_input)))
+ other_score = _sklearn.mean_squared_error(predictions, float64_target['labels'])
+ self.assertAllClose(other_score, scores['MSE'])
+
def testContinueTraining(self):
boston = tf.contrib.learn.datasets.load_boston()
output_dir = tempfile.mkdtemp()
@@ -405,6 +446,22 @@ class EstimatorTest(tf.test.TestCase):
self.assertTrue('global_step' in scores)
self.assertEqual(100, scores['global_step'])
+ def testBostonAllDictionaryInput(self):
+ boston = tf.contrib.learn.datasets.load_boston()
+ est = tf.contrib.learn.Estimator(model_fn=linear_model_fn)
+ boston_input = {'input': boston.data}
+ float64_target = {'labels': boston.target.astype(np.float64)}
+ est.fit(x=boston_input, y=float64_target, steps=100)
+ scores = est.evaluate(
+ x=boston_input,
+ y=float64_target,
+ metrics={'MSE': tf.contrib.metrics.streaming_mean_squared_error})
+ predictions = np.array(list(est.predict(x=boston_input)))
+ other_score = _sklearn.mean_squared_error(predictions, boston.target)
+ self.assertAllClose(other_score, scores['MSE'])
+ self.assertTrue('global_step' in scores)
+ self.assertEqual(scores['global_step'], 100)
+
def testIrisAll(self):
iris = tf.contrib.learn.datasets.load_iris()
est = tf.contrib.learn.SKCompat(
@@ -428,6 +485,31 @@ class EstimatorTest(tf.test.TestCase):
self.assertTrue('global_step' in scores)
self.assertEqual(100, scores['global_step'])
+ def testIrisAllDictionaryInput(self):
+ iris = tf.contrib.learn.datasets.load_iris()
+ est = tf.contrib.learn.Estimator(model_fn=logistic_model_no_mode_fn)
+ iris_data = {'input': iris.data}
+ iris_target = {'labels': iris.target}
+ est.fit(iris_data, iris_target, steps=100)
+ scores = est.evaluate(
+ x=iris_data,
+ y=iris_target,
+ metrics={('accuracy', 'class'): tf.contrib.metrics.streaming_accuracy})
+ predictions = list(est.predict(x=iris_data))
+ predictions_class = list(est.predict(x=iris_data, outputs=['class']))
+ self.assertEqual(len(predictions), iris.target.shape[0])
+ classes_batch = np.array([p['class'] for p in predictions])
+ self.assertAllClose(
+ classes_batch,
+ np.array([p['class'] for p in predictions_class]))
+ self.assertAllClose(
+ classes_batch,
+ np.argmax(np.array([p['prob'] for p in predictions]), axis=1))
+ other_score = _sklearn.accuracy_score(iris.target, classes_batch)
+ self.assertAllClose(other_score, scores['accuracy'])
+ self.assertTrue('global_step' in scores)
+ self.assertEqual(scores['global_step'], 100)
+
def testIrisInputFn(self):
iris = tf.contrib.learn.datasets.load_iris()
est = tf.contrib.learn.Estimator(model_fn=logistic_model_no_mode_fn)
diff --git a/tensorflow/contrib/learn/python/learn/estimators/linear.py b/tensorflow/contrib/learn/python/learn/estimators/linear.py
index 45e430717f..c4a257b8d4 100644
--- a/tensorflow/contrib/learn/python/learn/estimators/linear.py
+++ b/tensorflow/contrib/learn/python/learn/estimators/linear.py
@@ -660,9 +660,10 @@ class LinearRegressor(evaluable.Evaluable, trainable.Trainable):
"""
self._feature_columns = feature_columns
assert self._feature_columns
- self._optimizer = _get_default_optimizer(feature_columns)
if optimizer:
self._optimizer = _get_optimizer(optimizer)
+ else:
+ self._optimizer = _get_default_optimizer(feature_columns)
chief_hook = None
if (isinstance(optimizer, sdca_optimizer.SDCAOptimizer) and
diff --git a/tensorflow/contrib/learn/python/learn/evaluable.py b/tensorflow/contrib/learn/python/learn/evaluable.py
index 14cf5f01b8..aff0d70cd5 100644
--- a/tensorflow/contrib/learn/python/learn/evaluable.py
+++ b/tensorflow/contrib/learn/python/learn/evaluable.py
@@ -51,12 +51,14 @@ class Evaluable(object):
for which this evaluation was performed.
Args:
- x: Matrix of shape [n_samples, n_features...] containing the input samples
- for fitting the model. Can be iterator that returns arrays of features.
- If set, `input_fn` must be `None`.
+ x: Matrix of shape [n_samples, n_features...] or dictionary of many matrices
+ containing the input samples for fitting the model. Can be iterator that returns
+ arrays of features or dictionary of array of features. If set, `input_fn` must
+ be `None`.
y: Vector or matrix [n_samples] or [n_samples, n_outputs] containing the
label values (class labels in classification, real numbers in
- regression). Can be iterator that returns array of labels. If set,
+ regression) or dictionary of multiple vectors/matrices. Can be iterator
+ that returns array of targets or dictionary of array of targets. If set,
`input_fn` must be `None`. Note: For classification, label values must
be integers representing the class index (i.e. values from 0 to
n_classes-1).
diff --git a/tensorflow/contrib/learn/python/learn/graph_actions.py b/tensorflow/contrib/learn/python/learn/graph_actions.py
index 5781d88bb8..55be25336e 100644
--- a/tensorflow/contrib/learn/python/learn/graph_actions.py
+++ b/tensorflow/contrib/learn/python/learn/graph_actions.py
@@ -299,10 +299,10 @@ def _monitored_train(graph,
while not super_sess.should_stop():
_, loss = super_sess.run([train_op, loss_op], feed_fn() if feed_fn else
None)
+
summary_io.SummaryWriterCache.clear()
return loss
-
# TODO(ispir): Deprecate train in favor of supervised_train
def train(graph,
output_dir,
diff --git a/tensorflow/contrib/learn/python/learn/learn_io/data_feeder.py b/tensorflow/contrib/learn/python/learn/learn_io/data_feeder.py
index 2ce11e813f..f665ff7644 100644
--- a/tensorflow/contrib/learn/python/learn/learn_io/data_feeder.py
+++ b/tensorflow/contrib/learn/python/learn/learn_io/data_feeder.py
@@ -36,27 +36,49 @@ from tensorflow.python.platform import tf_logging as logging
# pylint: disable=g-multiple-import,g-bad-import-order
from .pandas_io import HAS_PANDAS, extract_pandas_data, extract_pandas_matrix, extract_pandas_labels
from .dask_io import HAS_DASK, extract_dask_data, extract_dask_labels
+
+
# pylint: enable=g-multiple-import,g-bad-import-order
def _get_in_out_shape(x_shape, y_shape, n_classes, batch_size=None):
"""Returns shape for input and output of the data feeder."""
+ x_is_dict, y_is_dict = isinstance(x_shape, dict), y_shape is not None and isinstance(y_shape, dict)
+ if y_is_dict and n_classes is not None:
+ assert (isinstance(n_classes, dict))
+
if batch_size is None:
- batch_size = x_shape[0]
+ batch_size = list(x_shape.values())[0][0] if x_is_dict else x_shape[0]
elif batch_size <= 0:
raise ValueError('Invalid batch_size %d.' % batch_size)
- x_shape = list(x_shape[1:]) if len(x_shape) > 1 else [1]
- input_shape = [batch_size] + x_shape
+
+ if x_is_dict:
+ input_shape = {}
+ for k, v in list(x_shape.items()):
+ input_shape[k] = [batch_size] + (list(v[1:]) if len(v) > 1 else [1])
+ else:
+ x_shape = list(x_shape[1:]) if len(x_shape) > 1 else [1]
+ input_shape = [batch_size] + x_shape
+
if y_shape is None:
return input_shape, None, batch_size
- y_shape = list(y_shape[1:]) if len(y_shape) > 1 else []
- # Skip first dimension if it is 1.
- if y_shape and y_shape[0] == 1:
- y_shape = y_shape[1:]
- if n_classes is not None and n_classes > 1:
- output_shape = [batch_size] + y_shape + [n_classes]
+
+ def out_el_shape(out_shape, num_classes):
+ out_shape = list(out_shape[1:]) if len(out_shape) > 1 else []
+ # Skip first dimension if it is 1.
+ if out_shape and out_shape[0] == 1:
+ out_shape = out_shape[1:]
+ if num_classes is not None and num_classes > 1:
+ return [batch_size] + out_shape + [num_classes]
+ else:
+ return [batch_size] + out_shape
+
+ if not y_is_dict:
+ output_shape = out_el_shape(y_shape, n_classes)
else:
- output_shape = [batch_size] + y_shape
+ output_shape = dict([(k, out_el_shape(v, n_classes[k] if n_classes is not None and k in n_classes else None))
+ for k, v in list(y_shape.items())])
+
return input_shape, output_shape, batch_size
@@ -78,15 +100,18 @@ def _is_iterable(x):
def setup_train_data_feeder(
- x, y, n_classes, batch_size=None, shuffle=True, epochs=None):
+ x, y, n_classes, batch_size=None, shuffle=True, epochs=None):
"""Create data feeder, to sample inputs from dataset.
If `x` and `y` are iterators, use `StreamingDataFeeder`.
Args:
- x: numpy, pandas or Dask matrix or iterable.
- y: numpy, pandas or Dask array or iterable.
- n_classes: number of classes.
+ x: numpy, pandas or Dask matrix or dictionary of aforementioned. Also
+ supports iterables.
+ y: numpy, pandas or Dask array or dictionary of aforementioned. Also supports
+ iterables.
+ n_classes: number of classes. Must be `None` or the same type as `y`. In case `y` is a `dict`
+ (or an iterable which returns dicts), `n_classes[key]` is the number of classes for `y[key]`.
batch_size: size to split data into parts. Must be >= 1.
shuffle: Whether to shuffle the inputs.
epochs: Number of epochs to run.
@@ -102,7 +127,7 @@ def setup_train_data_feeder(
# pylint: disable=g-import-not-at-top
import dask.dataframe as dd
if (isinstance(x, (dd.Series, dd.DataFrame)) and
- (y is None or isinstance(y, (dd.Series, dd.DataFrame)))):
+ (y is None or isinstance(y, (dd.Series, dd.DataFrame)))):
data_feeder_cls = DaskDataFeeder
else:
data_feeder_cls = DataFeeder
@@ -115,31 +140,54 @@ def setup_train_data_feeder(
'streaming learning to work.')
return StreamingDataFeeder(x, y, n_classes, batch_size)
return data_feeder_cls(
- x, y, n_classes, batch_size, shuffle=shuffle, epochs=epochs)
+ x, y, n_classes, batch_size, shuffle=shuffle, epochs=epochs)
def _batch_data(x, batch_size=None):
if (batch_size is not None) and (batch_size <= 0):
raise ValueError('Invalid batch_size %d.' % batch_size)
- chunk = []
+
+ x_first_el = six.next(x)
+ x = itertools.chain([x_first_el], x)
+
+ chunk = dict([(k, []) for k in list(x_first_el.keys())]) if isinstance(x_first_el, dict) else []
+ chunk_filled = False
for data in x:
- chunk.append(data)
- if (batch_size is not None) and (len(chunk) >= batch_size):
- yield np.matrix(chunk)
- chunk = []
- yield np.matrix(chunk)
+ if isinstance(data, dict):
+ for k, v in list(data.items()):
+ chunk[k].append(v)
+ if (batch_size is not None) and (len(chunk[k]) >= batch_size):
+ chunk[k] = np.matrix(chunk[k])
+ chunk_filled = True
+ if chunk_filled:
+ yield chunk
+ chunk = dict([(k, []) for k in list(x_first_el.keys())]) if isinstance(x_first_el, dict) else []
+ chunk_filled = False
+ else:
+ chunk.append(data)
+ if (batch_size is not None) and (len(chunk) >= batch_size):
+ yield np.matrix(chunk)
+ chunk = []
+
+ if isinstance(x_first_el, dict):
+ for k, v in list(data.items()):
+ chunk[k] = np.matrix(chunk[k])
+ yield chunk
+ else:
+ yield np.matrix(chunk)
def setup_predict_data_feeder(x, batch_size=None):
"""Returns an iterable for feeding into predict step.
Args:
- x: numpy, pandas, Dask array or iterable.
- batch_size: Size of batches to split data into.
- If `None`, returns one batch of full size.
+ x: numpy, pandas, Dask array or dictionary of aforementioned. Also supports
+ iterable.
+ batch_size: Size of batches to split data into. If `None`, returns one
+ batch of full size.
Returns:
- List or iterator of parts of data to predict on.
+ List or iterator (or dictionary thereof) of parts of data to predict on.
Raises:
ValueError: if `batch_size` <= 0.
@@ -211,7 +259,7 @@ def _access(data, iloc):
def _check_dtype(dtype):
if dtypes.as_dtype(dtype) == dtypes.float64:
logging.warn(
- 'float64 is not supported by many models, consider casting to float32.')
+ 'float64 is not supported by many models, consider casting to float32.')
return dtype
@@ -219,63 +267,85 @@ class DataFeeder(object):
"""Data feeder is an example class to sample data for TF trainer."""
def __init__(
- self, x, y, n_classes, batch_size=None, shuffle=True, random_state=None,
- epochs=None):
+ self, x, y, n_classes, batch_size=None, shuffle=True, random_state=None,
+ epochs=None):
"""Initializes a DataFeeder instance.
Args:
- x: Feature Nd numpy matrix of shape `[n_samples, n_features, ...]`.
- y: Label vector, either floats for regression or class id for
- classification. If matrix, will consider as a sequence
- of labels. Can be `None` for unsupervised setting.
+ x: One feature sample which can be either an Nd numpy matrix of shape
+ `[n_samples, n_features, ...]` or a dictionary of Nd numpy matrices.
+ y: label vector, either floats for regression or class id for
+ classification. If matrix, will consider as a sequence of labels.
+ Can be `None` for unsupervised setting. Also supports dictionary of
+ labels.
n_classes: Number of classes, 0 and 1 are considered regression, `None`
- will pass through the input labels without one-hot conversion.
- batch_size: Mini-batch size to accumulate.
+ will pass through the input labels without one-hot conversion. Also, if
+ `y` is `dict`, then `n_classes` must be `dict` such that
+ `n_classes[key] = n_classes for label y[key]`, `None` otherwise.
+ batch_size: Mini-batch size to accumulate samples in one mini batch.
shuffle: Whether to shuffle `x`.
random_state: Numpy `RandomState` object to reproduce sampling.
epochs: Number of times to iterate over input data before raising
`StopIteration` exception.
Attributes:
- x: Input features.
- y: Input label.
+ x: Input features (ndarray or dictionary of ndarrays).
+ y: Input label (ndarray or dictionary of ndarrays).
n_classes: Number of classes (if `None`, pass through indices without
one-hot conversion).
batch_size: Mini-batch size to accumulate.
- input_shape: Shape of the input.
- output_shape: Shape of the output.
- input_dtype: DType of input.
- output_dtype: DType of output.
+ input_shape: Shape of the input (or dictionary of shapes).
+ output_shape: Shape of the output (or dictionary of shapes).
+ input_dtype: DType of input (or dictionary of dtypes).
+ output_dtype: DType of output (or dictionary of dtypes).
"""
- self._x = check_array(x, dtype=x.dtype)
- # self.n_classes is None means we're passing in raw label indices.
- y_dtype = (
- np.int64 if n_classes is not None and n_classes > 1 else np.float32)
+ x_is_dict, y_is_dict = isinstance(x, dict), y is not None and isinstance(y, dict)
+ if isinstance(y, list):
+ y = np.array(y)
+
+ self._x = dict([(k, check_array(v, v.dtype)) for k, v in list(x.items())]) if x_is_dict else check_array(x, x.dtype)
+ self._y = None if y is None else \
+ dict([(k, check_array(v, v.dtype)) for k, v in list(y.items())]) if x_is_dict else check_array(y, y.dtype)
+
+ # self.n_classes is not None means we're converting raw target indices to one-hot.
if n_classes is not None:
- self._y = (None if y is None else check_array(y, dtype=y_dtype))
- elif isinstance(y, list):
- self._y = np.array(y)
- else:
- self._y = y
+ if not y_is_dict:
+ y_dtype = (np.int64 if n_classes is not None and n_classes > 1 else np.float32)
+ self._y = (None if y is None else check_array(y, dtype=y_dtype))
+
self.n_classes = n_classes
self.max_epochs = epochs
+
+ x_shape = dict([(k, v.shape) for k, v in list(self._x.items())]) if x_is_dict else self._x.shape
+ y_shape = dict(
+ [(k, v.shape) for k, v in list(self._y.items())]) if y_is_dict else None if y is None else self._y.shape
+
self.input_shape, self.output_shape, self._batch_size = _get_in_out_shape(
- self._x.shape, None if self._y is None else self._y.shape, n_classes,
- batch_size)
+ x_shape, y_shape, n_classes, batch_size)
+
# Input dtype matches dtype of x.
- self._input_dtype = _check_dtype(self._x.dtype)
- # self.n_classes is None means we're passing in raw label indices
- if n_classes is not None or self._y is None:
- self._output_dtype = np.float32
- else:
- self._output_dtype = _check_dtype(self._y.dtype)
+ self._input_dtype = dict([(k, _check_dtype(v.dtype)) for k, v in list(self._x.items())]) if x_is_dict \
+ else _check_dtype(self._x.dtype)
+
+ # note: self._output_dtype = np.float32 when y is None
+ self._output_dtype = dict([(k, _check_dtype(v.dtype)) for k, v in list(self._y.items())]) if y_is_dict \
+ else _check_dtype(self._y.dtype) if y is not None else np.float32
+
+ # self.n_classes is None means we're passing in raw target indices
+ if n_classes is not None and y_is_dict:
+ for key in list(n_classes.keys()):
+ if key in self._output_dtype:
+ self._output_dtype[key] = np.float32
+
self._shuffle = shuffle
self.random_state = np.random.RandomState(
- 42) if random_state is None else random_state
+ 42) if random_state is None else random_state
+
+ num_samples = list(self._x.values())[0].shape[0] if x_is_dict else self._x.shape[0]
if self._shuffle:
- self.indices = self.random_state.permutation(self._x.shape[0])
+ self.indices = self.random_state.permutation(num_samples)
else:
- self.indices = np.array(range(self._x.shape[0]))
+ self.indices = np.array(range(num_samples))
self.offset = 0
self.epoch = 0
self._epoch_placeholder = None
@@ -320,19 +390,27 @@ class DataFeeder(object):
Returns:
Two placeholders for inputs and outputs.
"""
- input_shape = [None] + self.input_shape[1:]
- self._input_placeholder = array_ops.placeholder(
- dtypes.as_dtype(self._input_dtype),
- input_shape,
- name='input')
- if self.output_shape is None:
- self._output_placeholder = None
- else:
- output_shape = [None] + self.output_shape[1:]
- self._output_placeholder = array_ops.placeholder(
- dtypes.as_dtype(self._output_dtype),
- output_shape,
- name='output')
+
+ def get_placeholder(shape, dtype, name_prepend):
+ if shape is None:
+ return None
+ if isinstance(shape, dict):
+ placeholder = {}
+ for key in list(shape.keys()):
+ placeholder[key] = array_ops.placeholder(
+ dtypes.as_dtype(dtype[key]),
+ [None] + shape[key][1:],
+ name=name_prepend + '_' + key
+ )
+ else:
+ placeholder = array_ops.placeholder(
+ dtypes.as_dtype(dtype),
+ [None] + shape[1:],
+ name=name_prepend)
+ return placeholder
+
+ self._input_placeholder = get_placeholder(self.input_shape, self._input_dtype, 'input')
+ self._output_placeholder = get_placeholder(self.output_shape, self._output_dtype, 'output')
return self._input_placeholder, self._output_placeholder
def set_placeholders(self, input_placeholder, output_placeholder):
@@ -342,21 +420,21 @@ class DataFeeder(object):
input_placeholder: Placeholder for `x` variable. Should match shape
of the examples in the x dataset.
output_placeholder: Placeholder for `y` variable. Should match
- shape of the examples in the y dataset. Can be None.
+ shape of the examples in the y dataset. Can be `None`.
"""
self._input_placeholder = input_placeholder
self._output_placeholder = output_placeholder
def get_feed_params(self):
- """Function returns a dict with data feed params while training.
+ """Function returns a `dict` with data feed params while training.
Returns:
- A dict with data feed params while training.
+ A `dict` with data feed params while training.
"""
return {
- 'epoch': self.epoch,
- 'offset': self.offset,
- 'batch_size': self._batch_size
+ 'epoch': self.epoch,
+ 'offset': self.offset,
+ 'batch_size': self._batch_size
}
def get_feed_dict_fn(self):
@@ -364,8 +442,35 @@ class DataFeeder(object):
Returns:
A function that when called samples a random subset of batch size
- from x and y.
+ from `x` and `y`.
"""
+ x_is_dict, y_is_dict = isinstance(self._x, dict), self._y is not None and isinstance(self._y, dict)
+
+ # Assign input features from random indices.
+ def extract(data, indices):
+ return (np.array(_access(data, indices)).reshape((indices.shape[0], 1))
+ if len(data.shape) == 1 else _access(data, indices))
+
+ # assign labels from random indices
+ def assign_label(data, shape, dtype, n_classes, indices):
+ shape[0] = indices.shape[0]
+ out = np.zeros(shape, dtype=dtype)
+ for i in xrange(out.shape[0]):
+ sample = indices[i]
+ # self.n_classes is None means we're passing in raw target indices
+ if n_classes is None:
+ out[i] = _access(data, sample)
+ else:
+ if n_classes > 1:
+ if len(shape) == 2:
+ out.itemset((i, int(_access(data, sample))), 1.0)
+ else:
+ for idx, value in enumerate(_access(data, sample)):
+ out.itemset(tuple([i, idx, value]), 1.0)
+ else:
+ out[i] = _access(data, sample)
+ return out
+
def _feed_dict_fn():
"""Function that samples data into given placeholders."""
if self.max_epochs is not None and self.epoch + 1 > self.max_epochs:
@@ -376,20 +481,19 @@ class DataFeeder(object):
feed_dict[self._epoch_placeholder.name] = [self.epoch]
# Take next batch of indices.
- end = min(self._x.shape[0], self.offset + self._batch_size)
+ x_len = list(self._x.values())[0].shape[0] if x_is_dict else self._x.shape[0]
+ end = min(x_len, self.offset + self._batch_size)
batch_indices = self.indices[self.offset:end]
- # Assign input features from random indices.
- inp = (
- np.array(_access(self._x, batch_indices)).reshape(
- (batch_indices.shape[0], 1))
- if len(self._x.shape) == 1 else _access(self._x, batch_indices))
- feed_dict[self._input_placeholder.name] = inp
+ # adding input placeholder
+ feed_dict.update(
+ dict([(self._input_placeholder[k].name, extract(v, batch_indices)) for k, v in list(self._x.items())])
+ if x_is_dict else {self._input_placeholder.name: extract(self._x, batch_indices)})
# move offset and reset it if necessary
self.offset += self._batch_size
- if self.offset >= self._x.shape[0]:
- self.indices = self.random_state.permutation(self._x.shape[0])
+ if self.offset >= x_len:
+ self.indices = self.random_state.permutation(x_len) if self._shuffle else np.array(range(x_len))
self.offset = 0
self.epoch += 1
@@ -397,24 +501,18 @@ class DataFeeder(object):
if self._output_placeholder is None:
return feed_dict
- # assign labels from random indices
- self.output_shape[0] = batch_indices.shape[0]
- out = np.zeros(self.output_shape, dtype=self._output_dtype)
- for i in xrange(out.shape[0]):
- sample = batch_indices[i]
- # self.n_classes is None means we're passing in raw label indices
- if self.n_classes is None:
- out[i] = _access(self._y, sample)
- else:
- if self.n_classes > 1:
- if len(self.output_shape) == 2:
- out.itemset((i, int(_access(self._y, sample))), 1.0)
- else:
- for idx, value in enumerate(_access(self._y, sample)):
- out.itemset(tuple([i, idx, value]), 1.0)
- else:
- out[i] = _access(self._y, sample)
- feed_dict[self._output_placeholder.name] = out
+ # adding output placeholders
+ if y_is_dict:
+ for k, v in list(self._y.items()):
+ n_classes = (
+ self.n_classes[k] if k in self.n_classes else None) if self.n_classes is not None else None
+ shape, dtype = self.output_shape[k], self._output_dtype[k]
+ feed_dict.update(
+ {self._output_placeholder[k].name: assign_label(v, shape, dtype, n_classes, batch_indices)})
+ else:
+ shape, dtype, n_classes = self.output_shape, self._output_dtype, self.n_classes
+ feed_dict.update(
+ {self._output_placeholder.name: assign_label(self._y, shape, dtype, n_classes, batch_indices)})
return feed_dict
@@ -433,21 +531,29 @@ class StreamingDataFeeder(DataFeeder):
"""Initializes a StreamingDataFeeder instance.
Args:
- x: iterator that returns for each element, returns features.
- y: iterator that returns for each element, returns 1 or many classes /
- regression values.
- n_classes: indicator of how many classes the label has.
- batch_size: Mini batch size to accumulate.
+ x: iterator each element of which returns one feature sample. Sample can
+ be a Nd numpy matrix or dictionary of Nd numpy matrices.
+ y: iterator each element of which returns one label sample. Sample can be
+ a Nd numpy matrix or dictionary of Nd numpy matrices with 1 or many
+ classes regression values.
+ n_classes: indicator of how many classes the corresponding label sample
+ has for the purposes of one-hot conversion of label. In case where `y`
+ is a dictionary, `n_classes` must be dictionary (with same keys as `y`)
+ of how many classes there are in each label in `y`. If key is
+ present in `y` and missing in `n_classes`, the value is assumed `None`
+ and no one-hot conversion will be applied to the label with that key.
+ batch_size: Mini batch size to accumulate samples in one batch. If set to
+ `None`, then assumes that the iterator returns already-batched elements.
Attributes:
- x: input features.
- y: input label.
+ x: input features (or dictionary of input features).
+ y: input label (or dictionary of input labels).
n_classes: number of classes.
batch_size: mini batch size to accumulate.
- input_shape: shape of the input.
- output_shape: shape of the output.
- input_dtype: dtype of input.
- output_dtype: dtype of output.
+ input_shape: shape of the input (can be dictionary depending on `x`).
+ output_shape: shape of the output (can be dictionary depending on `y`).
+ input_dtype: dtype of input (can be dictionary depending on `x`).
+ output_dtype: dtype of output (can be dictionary depending on `y`).
"""
# pylint: disable=invalid-name,super-init-not-called
x_first_el = six.next(x)
@@ -459,25 +565,48 @@ class StreamingDataFeeder(DataFeeder):
y_first_el = None
self._y = None
self.n_classes = n_classes
- x_first_el = ops.convert_to_tensor(x_first_el)
- y_first_el = ops.convert_to_tensor(y_first_el) if y is not None else None
- self.input_shape, self.output_shape, self._batch_size = _get_in_out_shape(
- [1] + list(x_first_el.get_shape()),
- [1] + list(y_first_el.get_shape()) if y is not None else None,
- n_classes,
- batch_size)
- self._input_dtype = _check_dtype(x_first_el.dtype).as_numpy_dtype
+
+ x_is_dict, y_is_dict = isinstance(x_first_el, dict), y is not None and isinstance(y_first_el, dict)
+ if y_is_dict and n_classes is not None:
+ assert (isinstance(n_classes, dict))
+
+ # extract shapes for first_elements
+ x_first_el_shape = dict([(k, [1] + list(v.shape)) for k, v in list(x_first_el.items())]) if x_is_dict \
+ else [1] + list(x_first_el.shape)
+
+ y_first_el_shape = dict([(k, [1] + list(v.shape)) for k, v in list(y_first_el.items())]) if y_is_dict \
+ else ([1] + list(y_first_el[0].shape if isinstance(y_first_el, list) else y_first_el.shape)
+ if y is not None else None)
+
+ self.input_shape, self.output_shape, self._batch_size = _get_in_out_shape(x_first_el_shape, y_first_el_shape,
+ n_classes, batch_size)
+
+ # Input dtype of x_first_el.
+ self._input_dtype = dict([(k, _check_dtype(v.dtype)) for k, v in list(x_first_el.items())]) if x_is_dict \
+ else _check_dtype(x_first_el.dtype)
+
+ # Output dtype of y_first_el.
+ def check_y_dtype(el):
+ if isinstance(el, list) or isinstance(el, np.ndarray):
+ if isinstance(el, np.ndarray) and el.ndim == 0:
+ return el.dtype
+ else:
+ return _check_dtype(np.dtype(type(el[0])))
+ else:
+ return _check_dtype(np.dtype(type(el)))
+
# Output types are floats, due to both softmaxes and regression req.
- if n_classes is not None and n_classes > 0:
+ if n_classes is not None and (y is None or not y_is_dict) and n_classes > 0:
self._output_dtype = np.float32
- elif y is not None:
- self._output_dtype = _check_dtype(y_first_el.dtype).as_numpy_dtype
+ else:
+ self._output_dtype = dict([(k, check_y_dtype(v)) for k, v in list(y_first_el.items())]) if y_is_dict \
+ else (check_y_dtype(y_first_el) if y is not None else None)
def get_feed_params(self):
- """Function returns a dict with data feed params while training.
+ """Function returns a `dict` with data feed params while training.
Returns:
- A dict with data feed params while training.
+ A `dict` with data feed params while training.
"""
return {'batch_size': self._batch_size}
@@ -494,50 +623,76 @@ class StreamingDataFeeder(DataFeeder):
"""Samples data and provides it to placeholders.
Returns:
- Dict of input and output tensors.
+ `dict` of input and output tensors.
"""
+
+ def init_array(shape, dtype):
+ if shape is None:
+ return None
+ else:
+ return dict([(k, np.zeros(shape[k], dtype[k])) for k in list(shape.keys())]) if isinstance(shape, dict) else \
+ np.zeros(shape, dtype=dtype)
+
+ def put_data_array(dest, index, source=None, n_classes=None):
+ if source is None:
+ dest = dest[:index, :]
+ elif n_classes is not None and n_classes > 1:
+ if len(self.output_shape) == 2:
+ dest.itemset((index, source), 1.0)
+ else:
+ for idx, value in enumerate(source):
+ dest.itemset(tuple([index, idx, value]), 1.0)
+ else:
+ if len(dest.shape) > 1:
+ dest[index, :] = source
+ else:
+ dest[index] = source[0] if isinstance(source, list) else source
+ return dest
+
+ def put_data_array_or_dict(holder, index, data=None, n_classes=None):
+ if holder is None:
+ return None
+ if isinstance(holder, dict):
+ assert (isinstance(data, dict))
+ for k, v in list(holder.items()):
+ num_classes = n_classes[k] if (n_classes is not None and k in n_classes) else None
+ holder[k] = put_data_array(holder[k], index, data[k], num_classes)
+ else:
+ holder = put_data_array(holder, index, data, n_classes)
+ return holder
+
if self.stopped:
raise StopIteration
- try:
- inp = np.zeros(self.input_shape, dtype=self._input_dtype)
- except TypeError as exc:
- raise TypeError('Unrecognized dtype: {}. {}'.format(
- self._input_dtype, exc))
- if self._y is not None:
- out = np.zeros(self.output_shape, dtype=self._output_dtype)
+
+ inp = init_array(self.input_shape, self._input_dtype)
+ out = init_array(self.output_shape, self._output_dtype)
+
for i in xrange(self._batch_size):
# Add handling when queue ends.
try:
- inp[i, :] = six.next(self._x)
+ next_inp = six.next(self._x)
+ inp = put_data_array_or_dict(inp, i, next_inp, None)
except StopIteration:
self.stopped = True
if i == 0:
raise
- inp = inp[:i, :]
- if self._y is not None:
- out = out[:i]
+ inp = put_data_array_or_dict(inp, i, None, None)
+ out = put_data_array_or_dict(out, i, None, None)
break
if self._y is not None:
- y = six.next(self._y)
- if self.n_classes is not None and self.n_classes > 1:
- if len(self.output_shape) == 2:
- out.itemset((i, y), 1.0)
- else:
- for idx, value in enumerate(y):
- out.itemset(tuple([i, idx, value]), 1.0)
- else:
- # The y itertor can sometimes return scalars or singleton lists.
- try:
- out[i] = y
- except ValueError as _:
- assert len(y) == 1, ('Expected singleton label, got {}'
- .format(repr(y)))
- out[i] = y[0]
- if self._y is None:
- return {self._input_placeholder.name: inp}
- return {self._input_placeholder.name: inp,
- self._output_placeholder.name: out}
+ next_out = six.next(self._y)
+ out = put_data_array_or_dict(out, i, next_out, self.n_classes)
+
+ # creating feed_dict
+ feed_dict = dict([(self._input_placeholder[k].name, inp[k]) for k in list(self._input_placeholder.keys())]) if \
+ isinstance(inp, dict) else {self._input_placeholder.name: inp}
+ if self._y is not None:
+ feed_dict.update(
+ dict([(self._output_placeholder[k].name, out[k]) for k in list(self._output_placeholder.keys())]) \
+ if isinstance(out, dict) else {self._output_placeholder.name: out})
+
+ return feed_dict
return _feed_dict_fn
@@ -575,6 +730,10 @@ class DaskDataFeeder(object):
input_dtype: dtype of input.
output_dtype: dtype of output.
"""
+
+ if isinstance(x, dict) or isinstance(y, dict):
+ raise ValueError("DaskDataFeeder does not support dictionaries at the moment.")
+
# pylint: disable=invalid-name,super-init-not-called
import dask.dataframe as dd # pylint: disable=g-import-not-at-top
# TODO(terrytangyuan): check x and y dtypes in dask_io like pandas
@@ -601,7 +760,7 @@ class DaskDataFeeder(object):
self._shuffle = shuffle
self.epochs = epochs
self.input_shape, self.output_shape, self._batch_size = _get_in_out_shape(
- x_shape, y_shape, n_classes, batch_size)
+ x_shape, y_shape, n_classes, batch_size)
self.sample_fraction = self._batch_size / float(x_count)
self._input_dtype = _check_dtype(self._x.dtypes[0])
self._output_dtype = _check_dtype(self._y.dtypes[self._y_columns])
@@ -611,10 +770,10 @@ class DaskDataFeeder(object):
self.random_state = random_state
def get_feed_params(self):
- """Function returns a dict with data feed params while training.
+ """Function returns a `dict` with data feed params while training.
Returns:
- A dict with data feed params while training.
+ A `dict` with data feed params while training.
"""
return {'batch_size': self._batch_size}
@@ -629,13 +788,14 @@ class DaskDataFeeder(object):
A function that when called samples a random subset of batch size
from x and y.
"""
+
def _feed_dict_fn():
"""Samples data and provides it to placeholders."""
# TODO(ipolosukhin): option for with/without replacement (dev version of
# dask)
sample = self.df.random_split(
- [self.sample_fraction, 1 - self.sample_fraction],
- random_state=self.random_state)
+ [self.sample_fraction, 1 - self.sample_fraction],
+ random_state=self.random_state)
inp = extract_pandas_matrix(sample[0][self._x_columns].compute()).tolist()
out = extract_pandas_matrix(sample[0][self._y_columns].compute())
# convert to correct dtype
@@ -650,4 +810,5 @@ class DaskDataFeeder(object):
encoded_out[np.arange(out.size), out] = 1
return {input_placeholder.name: inp,
output_placeholder.name: encoded_out}
+
return _feed_dict_fn
diff --git a/tensorflow/contrib/learn/python/learn/learn_io/data_feeder_test.py b/tensorflow/contrib/learn/python/learn/learn_io/data_feeder_test.py
index fe675e3122..828db45757 100644
--- a/tensorflow/contrib/learn/python/learn/learn_io/data_feeder_test.py
+++ b/tensorflow/contrib/learn/python/learn/learn_io/data_feeder_test.py
@@ -32,150 +32,200 @@ class DataFeederTest(tf.test.TestCase):
# pylint: disable=undefined-variable
"""Tests for `DataFeeder`."""
+ def _wrap_dict(self, data, prepend=''):
+ return {prepend+'1': data, prepend+'2': data}
+
def _assert_raises(self, input_data):
with self.assertRaisesRegexp(TypeError, 'annot convert'):
data_feeder.DataFeeder(input_data, None, n_classes=0, batch_size=1)
def test_input_uint32(self):
- self._assert_raises(np.matrix([[1, 2], [3, 4]], dtype=np.uint32))
+ data = np.matrix([[1, 2], [3, 4]], dtype=np.uint32)
+ self._assert_raises(data)
+ self._assert_raises(self._wrap_dict(data))
def test_input_uint64(self):
- self._assert_raises(np.matrix([[1, 2], [3, 4]], dtype=np.uint64))
+ data = np.matrix([[1, 2], [3, 4]], dtype=np.uint64)
+ self._assert_raises(data)
+ self._assert_raises(self._wrap_dict(data))
def _assert_dtype(self, expected_np_dtype, expected_tf_dtype, input_data):
feeder = data_feeder.DataFeeder(input_data, None, n_classes=0, batch_size=1)
- self.assertEqual(expected_np_dtype, feeder.input_dtype)
+ if isinstance(input_data, dict):
+ for k, v in list(feeder.input_dtype.items()):
+ self.assertEqual(expected_np_dtype, v)
+ else:
+ self.assertEqual(expected_np_dtype, feeder.input_dtype)
with tf.Graph().as_default() as g, self.test_session(g):
inp, _ = feeder.input_builder()
- self.assertEqual(expected_tf_dtype, inp.dtype)
+ if isinstance(inp, dict):
+ for k, v in list(inp.items()):
+ self.assertEqual(expected_tf_dtype, v.dtype)
+ else:
+ self.assertEqual(expected_tf_dtype, inp.dtype)
def test_input_int8(self):
- self._assert_dtype(
- np.int8, tf.int8, np.matrix([[1, 2], [3, 4]], dtype=np.int8))
+ data = np.matrix([[1, 2], [3, 4]], dtype=np.int8)
+ self._assert_dtype(np.int8, tf.int8, data)
+ self._assert_dtype(np.int8, tf.int8, self._wrap_dict(data))
def test_input_int16(self):
- self._assert_dtype(
- np.int16, tf.int16, np.matrix([[1, 2], [3, 4]], dtype=np.int16))
+ data = np.matrix([[1, 2], [3, 4]], dtype=np.int16)
+ self._assert_dtype(np.int16, tf.int16, data)
+ self._assert_dtype(np.int16, tf.int16, self._wrap_dict(data))
def test_input_int32(self):
- self._assert_dtype(
- np.int32, tf.int32, np.matrix([[1, 2], [3, 4]], dtype=np.int32))
+ data = np.matrix([[1, 2], [3, 4]], dtype=np.int32)
+ self._assert_dtype(np.int32, tf.int32, data)
+ self._assert_dtype(np.int32, tf.int32, self._wrap_dict(data))
def test_input_int64(self):
- self._assert_dtype(
- np.int64, tf.int64, np.matrix([[1, 2], [3, 4]], dtype=np.int64))
+ data = np.matrix([[1, 2], [3, 4]], dtype=np.int64)
+ self._assert_dtype(np.int64, tf.int64, data)
+ self._assert_dtype(np.int64, tf.int64, self._wrap_dict(data))
def test_input_uint8(self):
- self._assert_dtype(
- np.uint8, tf.uint8, np.matrix([[1, 2], [3, 4]], dtype=np.uint8))
+ data = np.matrix([[1, 2], [3, 4]], dtype=np.uint8)
+ self._assert_dtype(np.uint8, tf.uint8, data)
+ self._assert_dtype(np.uint8, tf.uint8, self._wrap_dict(data))
def test_input_uint16(self):
- self._assert_dtype(
- np.uint16, tf.uint16, np.matrix([[1, 2], [3, 4]], dtype=np.uint16))
+ data = np.matrix([[1, 2], [3, 4]], dtype=np.uint16)
+ self._assert_dtype(np.uint16, tf.uint16, data)
+ self._assert_dtype(np.uint16, tf.uint16, self._wrap_dict(data))
def test_input_float16(self):
- self._assert_dtype(
- np.float16, tf.float16, np.matrix([[1, 2], [3, 4]], dtype=np.float16))
+ data = np.matrix([[1, 2], [3, 4]], dtype=np.float16)
+ self._assert_dtype(np.float16, tf.float16, data)
+ self._assert_dtype(np.float16, tf.float16, self._wrap_dict(data))
def test_input_float32(self):
- self._assert_dtype(
- np.float32, tf.float32, np.matrix([[1, 2], [3, 4]], dtype=np.float32))
+ data = np.matrix([[1, 2], [3, 4]], dtype=np.float32)
+ self._assert_dtype(np.float32, tf.float32, data)
+ self._assert_dtype(np.float32, tf.float32, self._wrap_dict(data))
def test_input_float64(self):
- self._assert_dtype(
- np.float64, tf.float64, np.matrix([[1, 2], [3, 4]], dtype=np.float64))
+ data = np.matrix([[1, 2], [3, 4]], dtype=np.float64)
+ self._assert_dtype(np.float64, tf.float64, data)
+ self._assert_dtype(np.float64, tf.float64, self._wrap_dict(data))
def test_input_bool(self):
- self._assert_dtype(
- np.bool, tf.bool,
- np.array([[False for _ in xrange(2)] for _ in xrange(2)]))
+ data = np.array([[False for _ in xrange(2)] for _ in xrange(2)])
+ self._assert_dtype(np.bool, tf.bool, data)
+ self._assert_dtype(np.bool, tf.bool, self._wrap_dict(data))
def test_input_string(self):
input_data = np.array([['str%d' % i for i in xrange(2)] for _ in xrange(2)])
self._assert_dtype(input_data.dtype, tf.string, input_data)
+ self._assert_dtype(input_data.dtype, tf.string, self._wrap_dict(input_data))
+
+ def _assertAllClose(self, src, dest, src_key_of=None, src_prop=None):
+ def func(x):
+ val = getattr(x, src_prop) if src_prop else x
+ return val if src_key_of is None else src_key_of[val]
+ if isinstance(src, dict):
+ for k in list(src.keys()):
+ self.assertAllClose(func(src[k]), dest)
+ else:
+ self.assertAllClose(func(src), dest)
def test_unsupervised(self):
+ def func(feeder):
+ with self.test_session():
+ inp, _ = feeder.input_builder()
+ feed_dict_fn = feeder.get_feed_dict_fn()
+ feed_dict = feed_dict_fn()
+ self._assertAllClose(inp, [[1, 2]], feed_dict, 'name')
data = np.matrix([[1, 2], [2, 3], [3, 4]])
- feeder = data_feeder.DataFeeder(data, None, n_classes=0, batch_size=1)
- with self.test_session():
- inp, _ = feeder.input_builder()
- feed_dict_fn = feeder.get_feed_dict_fn()
- feed_dict = feed_dict_fn()
- self.assertAllClose(feed_dict[inp.name], [[1, 2]])
+ func(data_feeder.DataFeeder(data, None, n_classes=0, batch_size=1))
+ func(data_feeder.DataFeeder(self._wrap_dict(data), None, n_classes=0, batch_size=1))
def test_data_feeder_regression(self):
+ def func(df):
+ inp, out = df.input_builder()
+ feed_dict_fn = df.get_feed_dict_fn()
+ feed_dict = feed_dict_fn()
+ self._assertAllClose(inp, [[3, 4], [1, 2]], feed_dict, 'name')
+ self._assertAllClose(out, [2, 1], feed_dict, 'name')
x = np.matrix([[1, 2], [3, 4]])
y = np.array([1, 2])
- df = data_feeder.DataFeeder(x, y, n_classes=0, batch_size=3)
- inp, out = df.input_builder()
- feed_dict_fn = df.get_feed_dict_fn()
- feed_dict = feed_dict_fn()
-
- self.assertAllClose(feed_dict[inp.name], [[3, 4], [1, 2]])
- self.assertAllClose(feed_dict[out.name], [2, 1])
+ func(data_feeder.DataFeeder(x, y, n_classes=0, batch_size=3))
+ func(data_feeder.DataFeeder(self._wrap_dict(x, 'in'), self._wrap_dict(y, 'out'),
+ n_classes=self._wrap_dict(0, 'out'), batch_size=3))
def test_epoch(self):
+ def func(feeder):
+ with self.test_session():
+ feeder.input_builder()
+ epoch = feeder.make_epoch_variable()
+ feed_dict_fn = feeder.get_feed_dict_fn()
+ # First input
+ feed_dict = feed_dict_fn()
+ self.assertAllClose(feed_dict[epoch.name], [0])
+ # Second input
+ feed_dict = feed_dict_fn()
+ self.assertAllClose(feed_dict[epoch.name], [0])
+ # Third input
+ feed_dict = feed_dict_fn()
+ self.assertAllClose(feed_dict[epoch.name], [0])
+ # Back to the first input again, so new epoch.
+ feed_dict = feed_dict_fn()
+ self.assertAllClose(feed_dict[epoch.name], [1])
data = np.matrix([[1, 2], [2, 3], [3, 4]])
labels = np.array([0, 0, 1])
- feeder = data_feeder.DataFeeder(data, labels, n_classes=0, batch_size=1)
- with self.test_session():
- feeder.input_builder()
- epoch = feeder.make_epoch_variable()
- feed_dict_fn = feeder.get_feed_dict_fn()
- # First input
- feed_dict = feed_dict_fn()
- self.assertAllClose(feed_dict[epoch.name], [0])
- # Second input
- feed_dict = feed_dict_fn()
- self.assertAllClose(feed_dict[epoch.name], [0])
- # Third input
- feed_dict = feed_dict_fn()
- self.assertAllClose(feed_dict[epoch.name], [0])
- # Back to the first input again, so new epoch.
- feed_dict = feed_dict_fn()
- self.assertAllClose(feed_dict[epoch.name], [1])
+ func(data_feeder.DataFeeder(data, labels, n_classes=0, batch_size=1))
+ func(data_feeder.DataFeeder(self._wrap_dict(data, 'in'), self._wrap_dict(labels, 'out'),
+ n_classes=self._wrap_dict(0, 'out'), batch_size=1))
def test_data_feeder_multioutput_regression(self):
+ def func(df):
+ inp, out = df.input_builder()
+ feed_dict_fn = df.get_feed_dict_fn()
+ feed_dict = feed_dict_fn()
+ self._assertAllClose(inp, [[3, 4], [1, 2]], feed_dict, 'name')
+ self._assertAllClose(out, [[3, 4], [1, 2]], feed_dict, 'name')
x = np.matrix([[1, 2], [3, 4]])
y = np.array([[1, 2], [3, 4]])
- df = data_feeder.DataFeeder(x, y, n_classes=0, batch_size=2)
- inp, out = df.input_builder()
- feed_dict_fn = df.get_feed_dict_fn()
- feed_dict = feed_dict_fn()
- self.assertAllClose(feed_dict[inp.name], [[3, 4], [1, 2]])
- self.assertAllClose(feed_dict[out.name], [[3, 4], [1, 2]])
+ func(data_feeder.DataFeeder(x, y, n_classes=0, batch_size=2))
+ func(data_feeder.DataFeeder(self._wrap_dict(x, 'in'), self._wrap_dict(y, 'out'),
+ n_classes=self._wrap_dict(0, 'out'), batch_size=2))
def test_data_feeder_multioutput_classification(self):
+ def func(df):
+ inp, out = df.input_builder()
+ feed_dict_fn = df.get_feed_dict_fn()
+ feed_dict = feed_dict_fn()
+ self._assertAllClose(inp, [[3, 4], [1, 2]], feed_dict, 'name')
+ self._assertAllClose(out,
+ [[[0, 0, 1, 0, 0], [0, 0, 0, 1, 0], [0, 0, 0, 0, 1]],
+ [[1, 0, 0, 0, 0], [0, 1, 0, 0, 0], [0, 0, 1, 0, 0]]],
+ feed_dict, 'name')
+
x = np.matrix([[1, 2], [3, 4]])
y = np.array([[0, 1, 2], [2, 3, 4]])
- df = data_feeder.DataFeeder(x, y, n_classes=5, batch_size=2)
- inp, out = df.input_builder()
- feed_dict_fn = df.get_feed_dict_fn()
- feed_dict = feed_dict_fn()
- self.assertAllClose(feed_dict[inp.name], [[3, 4], [1, 2]])
- self.assertAllClose(feed_dict[out.name],
- [[[0, 0, 1, 0, 0], [0, 0, 0, 1, 0], [0, 0, 0, 0, 1]],
- [[1, 0, 0, 0, 0], [0, 1, 0, 0, 0], [0, 0, 1, 0, 0]]])
+ func(data_feeder.DataFeeder(x, y, n_classes=5, batch_size=2))
+ func(data_feeder.DataFeeder(self._wrap_dict(x, 'in'), self._wrap_dict(y, 'out'),
+ n_classes=self._wrap_dict(5, 'out'), batch_size=2))
def test_streaming_data_feeder(self):
+ def func(df):
+ inp, out = df.input_builder()
+ feed_dict_fn = df.get_feed_dict_fn()
+ feed_dict = feed_dict_fn()
+ self._assertAllClose(inp, [[1, 2], [3, 4]], feed_dict, 'name')
+ self._assertAllClose(out, [1, 2], feed_dict, 'name' )
- def x_iter():
- yield np.array([1, 2])
- yield np.array([3, 4])
+ def x_iter(wrap_dict=False):
+ yield np.array([1, 2]) if not wrap_dict else self._wrap_dict(np.array([1, 2]), 'in')
+ yield np.array([3, 4]) if not wrap_dict else self._wrap_dict(np.array([3, 4]), 'in')
- def y_iter():
- yield np.array([1])
- yield np.array([2])
+ def y_iter(wrap_dict=False):
+ yield np.array([1]) if not wrap_dict else self._wrap_dict(np.array([1]), 'out')
+ yield np.array([2]) if not wrap_dict else self._wrap_dict(np.array([2]), 'out')
- df = data_feeder.StreamingDataFeeder(x_iter(),
- y_iter(),
- n_classes=0,
- batch_size=2)
- inp, out = df.input_builder()
- feed_dict_fn = df.get_feed_dict_fn()
- feed_dict = feed_dict_fn()
- self.assertAllClose(feed_dict[inp.name], [[1, 2], [3, 4]])
- self.assertAllClose(feed_dict[out.name], [1, 2])
+ func(data_feeder.StreamingDataFeeder(x_iter(), y_iter(), n_classes=0, batch_size=2))
+ func(data_feeder.StreamingDataFeeder(x_iter(True), y_iter(True),
+ n_classes=self._wrap_dict(0, 'out'), batch_size=2))
def test_dask_data_feeder(self):
if HAS_PANDAS and HAS_DASK:
@@ -196,6 +246,13 @@ class DataFeederTest(tf.test.TestCase):
self.assertAllClose(feed_dict[out.name], [[0., 0., 1.], [0., 1., 0.]])
def test_hdf5_data_feeder(self):
+ def func(df):
+ inp, out = df.input_builder()
+ feed_dict_fn = df.get_feed_dict_fn()
+ feed_dict = feed_dict_fn()
+ self._assertAllClose(inp, [[3, 4], [1, 2]], feed_dict, 'name')
+ self.assertAllClose(out, [2, 1], feed_dict, 'name')
+
try:
import h5py # pylint: disable=g-import-not-at-top
x = np.matrix([[1, 2], [3, 4]])
@@ -207,25 +264,28 @@ class DataFeederTest(tf.test.TestCase):
h5f = h5py.File('test_hdf5.h5', 'r')
x = h5f['x']
y = h5f['y']
- df = data_feeder.DataFeeder(x, y, n_classes=0, batch_size=3)
- inp, out = df.input_builder()
- feed_dict_fn = df.get_feed_dict_fn()
- feed_dict = feed_dict_fn()
- self.assertAllClose(feed_dict[inp.name], [[3, 4], [1, 2]])
- self.assertAllClose(feed_dict[out.name], [2, 1])
+ func(data_feeder.DataFeeder(x, y, n_classes=0, batch_size=3))
+ func(data_feeder.DataFeeder(self._wrap_dict(x, 'in'), self._wrap_dict(y, 'out'),
+ n_classes=self._wrap_dict(0, 'out'), batch_size=3))
except ImportError:
print("Skipped test for hdf5 since it's not installed.")
-class SetupPredictDataFeederTest(tf.test.TestCase):
+class SetupPredictDataFeederTest(DataFeederTest):
"""Tests for `DataFeeder.setup_predict_data_feeder`."""
def test_iterable_data(self):
# pylint: disable=undefined-variable
- x = iter([[1, 2], [3, 4], [5, 6]])
- df = data_feeder.setup_predict_data_feeder(x, batch_size=2)
- self.assertAllClose(six.next(df), [[1, 2], [3, 4]])
- self.assertAllClose(six.next(df), [[5, 6]])
+
+ def func(df):
+ self._assertAllClose(six.next(df), [[1, 2], [3, 4]])
+ self._assertAllClose(six.next(df), [[5, 6]])
+
+ data = [[1, 2], [3, 4], [5, 6]]
+ x = iter(data)
+ x_dict = iter([self._wrap_dict(v) for v in iter(data)])
+ func(data_feeder.setup_predict_data_feeder(x, batch_size=2))
+ func(data_feeder.setup_predict_data_feeder(x_dict, batch_size=2))
if __name__ == '__main__':
diff --git a/tensorflow/contrib/learn/python/learn/trainable.py b/tensorflow/contrib/learn/python/learn/trainable.py
index 8a1548738e..2d1d460425 100644
--- a/tensorflow/contrib/learn/python/learn/trainable.py
+++ b/tensorflow/contrib/learn/python/learn/trainable.py
@@ -33,17 +33,17 @@ class Trainable(object):
"""Trains a model given training data `x` predictions and `y` labels.
Args:
- x: Matrix of shape [n_samples, n_features...]. Can be iterator that
- returns arrays of features. The training input samples for fitting the
- model. If set, `input_fn` must be `None`.
- y: Vector or matrix [n_samples] or [n_samples, n_outputs]. Can be
- iterator that returns array of labels. The training label values
- (class labels in classification, real numbers in regression). If set,
- `input_fn` must be `None`. Note: For classification, label values must
+ x: Matrix of shape [n_samples, n_features...] or a dictionary of matrices.
+ Can be an iterator that returns arrays of features or a dictionary of arrays of features.
+ The training input samples for fitting the model. If set, `input_fn` must be `None`.
+ y: Vector or matrix [n_samples] or [n_samples, n_outputs] or a dictionary of the same.
+ Can be an iterator that returns arrays of labels or a dictionary of arrays of labels.
+ The training label values (class labels in classification, real numbers in regression).
+ If set, `input_fn` must be `None`. Note: For classification, label values must
be integers representing the class index (i.e. values from 0 to
n_classes-1).
input_fn: Input function returning a tuple of:
- features - Dictionary of string feature name to `Tensor` or `Tensor`.
+ features - `Tensor` or dictionary of string feature name to `Tensor`.
labels - `Tensor` or dictionary of `Tensor` with labels.
If input_fn is set, `x`, `y`, and `batch_size` must be `None`.
steps: Number of steps for which to train model. If `None`, train forever.
@@ -67,4 +67,3 @@ class Trainable(object):
`self`, for chaining.
"""
raise NotImplementedError
-
diff --git a/tensorflow/contrib/learn/python/learn/utils/__init__.py b/tensorflow/contrib/learn/python/learn/utils/__init__.py
index 149a4b9772..f313699c14 100644
--- a/tensorflow/contrib/learn/python/learn/utils/__init__.py
+++ b/tensorflow/contrib/learn/python/learn/utils/__init__.py
@@ -19,5 +19,4 @@ from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
-from tensorflow.contrib.learn.python.learn.utils import checkpoints
from tensorflow.contrib.learn.python.learn.utils.export import export_estimator
diff --git a/tensorflow/contrib/makefile/README.md b/tensorflow/contrib/makefile/README.md
index b010a4387b..9f7686d92f 100644
--- a/tensorflow/contrib/makefile/README.md
+++ b/tensorflow/contrib/makefile/README.md
@@ -142,10 +142,11 @@ xcode-select --install
If this is a new install, you will need to run XCode once to agree to the
license before continuing.
-Then install [automake](https://en.wikipedia.org/wiki/Automake):
+Then install [automake](https://en.wikipedia.org/wiki/Automake)/[libtool](https://en.wikipedia.org/wiki/GNU_Libtool):
```bash
brew install automake
+brew install libtool
```
Also, download the graph if you haven't already:
diff --git a/tensorflow/contrib/makefile/compile_ios_protobuf.sh b/tensorflow/contrib/makefile/compile_ios_protobuf.sh
index 42e6231ee0..c413924538 100755
--- a/tensorflow/contrib/makefile/compile_ios_protobuf.sh
+++ b/tensorflow/contrib/makefile/compile_ios_protobuf.sh
@@ -67,7 +67,6 @@ fi
make distclean
./configure \
---build=x86_64-apple-${OSX_VERSION} \
--host=i386-apple-${OSX_VERSION} \
--disable-shared \
--enable-cross-compile \
@@ -95,7 +94,6 @@ make install
make distclean
./configure \
---build=x86_64-apple-${OSX_VERSION} \
--host=x86_64-apple-${OSX_VERSION} \
--disable-shared \
--enable-cross-compile \
@@ -123,7 +121,6 @@ make install
make distclean
./configure \
---build=x86_64-apple-${OSX_VERSION} \
--host=armv7-apple-${OSX_VERSION} \
--with-protoc="${PROTOC_PATH}" \
--disable-shared \
@@ -147,7 +144,6 @@ make install
make distclean
./configure \
---build=x86_64-apple-${OSX_VERSION} \
--host=armv7s-apple-${OSX_VERSION} \
--with-protoc="${PROTOC_PATH}" \
--disable-shared \
@@ -171,7 +167,6 @@ make install
make distclean
./configure \
---build=x86_64-apple-${OSX_VERSION} \
--host=arm \
--with-protoc="${PROTOC_PATH}" \
--disable-shared \
diff --git a/tensorflow/contrib/metrics/python/ops/set_ops.py b/tensorflow/contrib/metrics/python/ops/set_ops.py
index 9b80d08830..bca8334110 100644
--- a/tensorflow/contrib/metrics/python/ops/set_ops.py
+++ b/tensorflow/contrib/metrics/python/ops/set_ops.py
@@ -19,7 +19,6 @@ from __future__ import print_function
from tensorflow.python.ops import sets
-
set_size = sets.set_size
set_intersection = sets.set_intersection
diff --git a/tensorflow/contrib/rnn/python/kernel_tests/gru_ops_test.py b/tensorflow/contrib/rnn/python/kernel_tests/gru_ops_test.py
index a3d7c9745e..b6903eee29 100644
--- a/tensorflow/contrib/rnn/python/kernel_tests/gru_ops_test.py
+++ b/tensorflow/contrib/rnn/python/kernel_tests/gru_ops_test.py
@@ -158,7 +158,7 @@ class GRUBlockCellTest(tf.test.TestCase):
output = gru_ops.GRUBlockCell(cell_size)(x, h)
sess.run([tf.global_variables_initializer()])
- all_variables = tf.all_variables()[0:4]
+ all_variables = tf.global_variables()[0:4]
[w_ru, b_ru, w_c, b_c] = all_variables
d_new_h_wrt_x = tf.gradients([output], x)
@@ -178,7 +178,7 @@ class GRUBlockCellTest(tf.test.TestCase):
output = tf.contrib.rnn.GRUCell(cell_size)(x, h)
sess.run([tf.global_variables_initializer()])
- all_variables = tf.all_variables()[4:8]
+ all_variables = tf.global_variables()[4:8]
[w_ru, b_ru, w_c, b_c] = all_variables
d_new_h_wrt_x = tf.gradients([output], x)
@@ -281,7 +281,7 @@ class GRUBlockCellTest(tf.test.TestCase):
sess.run([tf.global_variables_initializer()])
- all_variables = tf.all_variables()
+ all_variables = tf.global_variables()
[w_ru, b_ru, w_c, b_c] = all_variables[:4]
diff --git a/tensorflow/contrib/rnn/python/kernel_tests/rnn_test.py b/tensorflow/contrib/rnn/python/kernel_tests/rnn_test.py
index f024f9b92b..8374b505a7 100644
--- a/tensorflow/contrib/rnn/python/kernel_tests/rnn_test.py
+++ b/tensorflow/contrib/rnn/python/kernel_tests/rnn_test.py
@@ -382,7 +382,7 @@ class StackBidirectionalRNNTest(tf.test.TestCase):
# check that all the variables names starts with the proper scope.
tf.global_variables_initializer()
- all_vars = tf.all_variables()
+ all_vars = tf.global_variables()
prefix = prefix or "stack_bidirectional_rnn"
scope_vars = [v for v in all_vars if v.name.startswith(prefix + "/")]
tf.logging.info("StackRNN with scope: %s (%s)"
diff --git a/tensorflow/contrib/seq2seq/BUILD b/tensorflow/contrib/seq2seq/BUILD
index d5f0abe6a7..178bd81afb 100644
--- a/tensorflow/contrib/seq2seq/BUILD
+++ b/tensorflow/contrib/seq2seq/BUILD
@@ -16,9 +16,9 @@ py_library(
)
cuda_py_test(
- name = "layers_test",
+ name = "decoder_fn_test",
size = "medium",
- srcs = ["python/kernel_tests/layers_test.py"],
+ srcs = ["python/kernel_tests/decoder_fn_test.py"],
additional_deps = [
":seq2seq_py",
"//tensorflow:tensorflow_py",
@@ -28,9 +28,9 @@ cuda_py_test(
)
cuda_py_test(
- name = "loss_test",
+ name = "seq2seq_test",
size = "medium",
- srcs = ["python/kernel_tests/loss_test.py"],
+ srcs = ["python/kernel_tests/seq2seq_test.py"],
additional_deps = [
":seq2seq_py",
"//tensorflow:tensorflow_py",
diff --git a/tensorflow/contrib/seq2seq/__init__.py b/tensorflow/contrib/seq2seq/__init__.py
index e67e4a7ca9..2627e31426 100644
--- a/tensorflow/contrib/seq2seq/__init__.py
+++ b/tensorflow/contrib/seq2seq/__init__.py
@@ -13,7 +13,7 @@
# limitations under the License.
# ==============================================================================
-"""Ops for building neural network seq2seq layers and losses."""
+"""Ops for building neural network seq2seq decoders and losses."""
from __future__ import absolute_import
from __future__ import division
@@ -22,6 +22,7 @@ from __future__ import print_function
import sys
# pylint: disable=unused-import,line-too-long
-from tensorflow.contrib.seq2seq.python.ops import layers
-from tensorflow.contrib.seq2seq.python.ops import loss
+from tensorflow.contrib.seq2seq.python.ops.decoder_fn import *
+from tensorflow.contrib.seq2seq.python.ops.loss import *
+from tensorflow.contrib.seq2seq.python.ops.seq2seq import *
# pylint: enable=unused-import,line-too-long
diff --git a/tensorflow/contrib/seq2seq/python/kernel_tests/__init__.py b/tensorflow/contrib/seq2seq/python/kernel_tests/__init__.py
new file mode 100644
index 0000000000..e69de29bb2
--- /dev/null
+++ b/tensorflow/contrib/seq2seq/python/kernel_tests/__init__.py
diff --git a/tensorflow/contrib/seq2seq/python/kernel_tests/layers_test.py b/tensorflow/contrib/seq2seq/python/kernel_tests/decoder_fn_test.py
index b4eaec658a..5e6dada294 100644
--- a/tensorflow/contrib/seq2seq/python/kernel_tests/layers_test.py
+++ b/tensorflow/contrib/seq2seq/python/kernel_tests/decoder_fn_test.py
@@ -13,7 +13,7 @@
# limitations under the License.
# ==============================================================================
-"""Tests for contrib.seq2seq.python.seq2seq.layers_ops."""
+"""Tests for contrib.seq2seq.python.seq2seq.loss_ops."""
# pylint: disable=unused-import,g-bad-import-order
from __future__ import absolute_import
from __future__ import division
@@ -23,12 +23,9 @@ from __future__ import print_function
import tensorflow as tf
-class LayersTest(tf.test.TestCase):
+class DecoderFnTest(tf.test.TestCase):
- def testRNNDecoder(self):
- pass
-
- def testRNNDecoderAttention(self):
+ def testDecoderFn(self):
pass
diff --git a/tensorflow/contrib/seq2seq/python/kernel_tests/seq2seq_test.py b/tensorflow/contrib/seq2seq/python/kernel_tests/seq2seq_test.py
new file mode 100644
index 0000000000..f71285b6d9
--- /dev/null
+++ b/tensorflow/contrib/seq2seq/python/kernel_tests/seq2seq_test.py
@@ -0,0 +1,129 @@
+# Copyright 2016 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+
+"""Tests for contrib.seq2seq.python.ops.seq2seq."""
+# pylint: disable=unused-import,g-bad-import-order
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+# pylint: enable=unused-import
+
+import tensorflow as tf
+from tensorflow.contrib import layers
+
+class Seq2SeqTest(tf.test.TestCase):
+
+ # test a default call of rnn_decoder
+ def test_rnn_decoder(self):
+ pass
+
+ # test default call with time_major=True
+ def test_dynamic_rnn_decoder_time_major(self):
+ with self.test_session() as sess:
+ with tf.variable_scope("root", initializer=
+ tf.constant_initializer(0.5)) as varscope:
+ # Define inputs/outputs to model
+ batch_size = 2
+ encoder_embedding_size = 3
+ decoder_embedding_size = 4
+ encoder_hidden_size = 5
+ decoder_hidden_size = encoder_hidden_size
+ input_sequence_length = 6
+ decoder_sequence_length = 7
+ num_decoder_symbols = 20
+ start_of_sequence_id = end_of_sequence_id = 1
+ decoder_embeddings = tf.get_variable('decoder_embeddings',
+ [num_decoder_symbols, decoder_embedding_size],
+ initializer=tf.random_normal_initializer(stddev=0.1))
+ inputs = tf.constant(0.5, shape=[input_sequence_length, batch_size,
+ encoder_embedding_size])
+ decoder_inputs = tf.constant(0.4, shape=[decoder_sequence_length,
+ batch_size,
+ decoder_embedding_size])
+ decoder_length = tf.constant(decoder_sequence_length, dtype=tf.int32,
+ shape=[batch_size,])
+ with tf.variable_scope("rnn") as scope:
+ # setting up weights for computing the final output
+ output_fn = lambda x: layers.linear(x, num_decoder_symbols,
+ scope=scope)
+
+ # Define model
+ encoder_outputs, encoder_state = tf.nn.dynamic_rnn(
+ cell=tf.nn.rnn_cell.GRUCell(encoder_hidden_size), inputs=inputs,
+ dtype=tf.float32, time_major=True, scope=scope)
+
+
+ with tf.variable_scope("decoder") as scope:
+ # Train decoder
+ decoder_cell = tf.nn.rnn_cell.GRUCell(decoder_hidden_size)
+ decoder_fn_train = tf.contrib.seq2seq.simple_decoder_fn_train(
+ encoder_state=encoder_state)
+ decoder_outputs_train, decoder_state_train = (
+ tf.contrib.seq2seq.dynamic_rnn_decoder(
+ cell=decoder_cell,
+ decoder_fn=decoder_fn_train,
+ inputs=decoder_inputs,
+ sequence_length=decoder_length,
+ time_major=True,
+ scope=scope))
+ decoder_outputs_train = output_fn(decoder_outputs_train)
+
+ # Setup variable reuse
+ scope.reuse_variables()
+
+ # Inference decoder
+ decoder_fn_inference = (
+ tf.contrib.seq2seq.simple_decoder_fn_inference(
+ output_fn=output_fn,
+ encoder_state=encoder_state,
+ embeddings=decoder_embeddings,
+ start_of_sequence_id=start_of_sequence_id,
+ end_of_sequence_id=end_of_sequence_id,
+ #TODO: find out why it goes to +1
+ maximum_length=decoder_sequence_length-1,
+ num_decoder_symbols=num_decoder_symbols,
+ dtype=tf.int32))
+ decoder_outputs_inference, decoder_state_inference = (
+ tf.contrib.seq2seq.dynamic_rnn_decoder(
+ cell=decoder_cell,
+ decoder_fn=decoder_fn_inference,
+ time_major=True,
+ scope=scope))
+
+ # Run model
+ tf.global_variables_initializer().run()
+ decoder_outputs_train_res, decoder_state_train_res = sess.run(
+ [decoder_outputs_train, decoder_state_train])
+ decoder_outputs_inference_res, decoder_state_inference_res = sess.run(
+ [decoder_outputs_inference, decoder_state_inference])
+
+ # Assert outputs
+ self.assertEqual((decoder_sequence_length, batch_size,
+ num_decoder_symbols),
+ decoder_outputs_train_res.shape)
+ self.assertEqual((batch_size, num_decoder_symbols),
+ decoder_outputs_inference_res.shape[1:3])
+ self.assertEqual((batch_size, decoder_hidden_size),
+ decoder_state_train_res.shape)
+ self.assertEqual((batch_size, decoder_hidden_size),
+ decoder_state_inference_res.shape)
+      # The dynamic decoder might end earlier than `maximum_length`
+ # under inference
+ true_value = (decoder_sequence_length>=
+ decoder_state_inference_res.shape[0])
+ self.assertEqual((true_value), True)
+
+if __name__ == '__main__':
+ tf.test.main()
diff --git a/tensorflow/contrib/seq2seq/python/ops/__init__.py b/tensorflow/contrib/seq2seq/python/ops/__init__.py
new file mode 100644
index 0000000000..e69de29bb2
--- /dev/null
+++ b/tensorflow/contrib/seq2seq/python/ops/__init__.py
diff --git a/tensorflow/contrib/seq2seq/python/ops/decoder_fn.py b/tensorflow/contrib/seq2seq/python/ops/decoder_fn.py
new file mode 100644
index 0000000000..d02efdc521
--- /dev/null
+++ b/tensorflow/contrib/seq2seq/python/ops/decoder_fn.py
@@ -0,0 +1,249 @@
+# Copyright 2016 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+
+"""Seq2seq decoder functions for use in neural networks.
+"""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import ops
+from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import control_flow_ops
+from tensorflow.python.ops import math_ops
+from tensorflow.python.ops import rnn_cell
+from tensorflow.python.util import nest
+
+__all__ = ["simple_decoder_fn_train",
+ "simple_decoder_fn_inference"]
+
+def simple_decoder_fn_train(encoder_state, name=None):
+ """ Simple decoder function for a sequence-to-sequence model used in the
+ `dynamic_rnn_decoder`.
+
+ The `simple_decoder_fn_train` is a simple training function for a
+ sequence-to-sequence model. It should be used when `dynamic_rnn_decoder` is
+ in the training mode.
+
+ The `simple_decoder_fn_train` is called with a set of the user arguments and
+ returns the `decoder_fn`, which can be passed to the `dynamic_rnn_decoder`,
+ such that
+
+ ```
+ dynamic_fn_train = simple_decoder_fn_train(encoder_state)
+ outputs_train, state_train = dynamic_rnn_decoder(
+ decoder_fn=dynamic_fn_train, ...)
+ ```
+
+ Further usage can be found in the `kernel_tests/seq2seq_test.py`.
+
+ Args:
+ encoder_state: The encoded state to initialize the `dynamic_rnn_decoder`.
+ name: (default: `None`) NameScope for the decoder function;
+ defaults to "simple_decoder_fn_train"
+
+ Returns:
+ A decoder function with the required interface of `dynamic_rnn_decoder`
+ intended for training.
+ """
+ with ops.name_scope(name, "simple_decoder_fn_train", [encoder_state]):
+ pass
+
+ def decoder_fn(time, cell_state, cell_input, cell_output, context_state):
+ """ Decoder function used in the `dynamic_rnn_decoder` with the purpose of
+ training.
+
+ Args:
+ time: positive integer constant reflecting the current timestep.
+ cell_state: state of RNNCell.
+ cell_input: input provided by `dynamic_rnn_decoder`.
+ cell_output: output of RNNCell.
+ context_state: context state provided by `dynamic_rnn_decoder`.
+
+ Returns:
+ A tuple (done, next state, next input, emit output, next context state)
+ where:
+
+ done: `None`, which is used by the `dynamic_rnn_decoder` to indicate
+ that `sequence_lengths` in `dynamic_rnn_decoder` should be used.
+
+ next state: `cell_state`, this decoder function does not modify the
+ given state.
+
+ next input: `cell_input`, this decoder function does not modify the
+ given input. The input could be modified when applying e.g. attention.
+
+ emit output: `cell_output`, this decoder function does not modify the
+ given output.
+
+ next context state: `context_state`, this decoder function does not
+ modify the given context state. The context state could be modified when
+ applying e.g. beam search.
+ """
+ with ops.name_scope(name, "simple_decoder_fn_train",
+ [time, cell_state, cell_input, cell_output,
+ context_state]):
+ if cell_state is None: # first call, return encoder_state
+ return (None, encoder_state, cell_input, cell_output, context_state)
+ else:
+ return (None, cell_state, cell_input, cell_output, context_state)
+ return decoder_fn
+
+
+def simple_decoder_fn_inference(output_fn, encoder_state, embeddings,
+ start_of_sequence_id, end_of_sequence_id,
+ maximum_length, num_decoder_symbols,
+ dtype=dtypes.int32, name=None):
+ """ Simple decoder function for a sequence-to-sequence model used in the
+ `dynamic_rnn_decoder`.
+
+ The `simple_decoder_fn_inference` is a simple inference function for a
+ sequence-to-sequence model. It should be used when `dynamic_rnn_decoder` is
+ in the inference mode.
+
+ The `simple_decoder_fn_inference` is called with a set of the user arguments
+ and returns the `decoder_fn`, which can be passed to the
+ `dynamic_rnn_decoder`, such that
+
+ ```
+ dynamic_fn_inference = simple_decoder_fn_inference(...)
+ outputs_inference, state_inference = dynamic_rnn_decoder(
+ decoder_fn=dynamic_fn_inference, ...)
+ ```
+
+ Further usage can be found in the `kernel_tests/seq2seq_test.py`.
+
+ Args:
+ output_fn: An output function to project your `cell_output` onto class
+ logits.
+
+ An example of an output function;
+
+ ```
+ tf.variable_scope("decoder") as varscope
+ output_fn = lambda x: layers.linear(x, num_decoder_symbols,
+ scope=varscope)
+
+ outputs_train, state_train = seq2seq.dynamic_rnn_decoder(...)
+ logits_train = output_fn(outputs_train)
+
+ varscope.reuse_variables()
+ logits_inference, state_inference = seq2seq.dynamic_rnn_decoder(
+ output_fn=output_fn, ...)
+ ```
+
+ If `None` is supplied it will act as an identity function, which
+ might be wanted when using the RNNCell `OutputProjectionWrapper`.
+
+ encoder_state: The encoded state to initialize the `dynamic_rnn_decoder`.
+ embeddings: The embeddings matrix used for the decoder sized
+ `[num_decoder_symbols, embedding_size]`.
+ start_of_sequence_id: The start of sequence ID in the decoder embeddings.
+ end_of_sequence_id: The end of sequence ID in the decoder embeddings.
+    maximum_length: The maximum allowed number of time steps to decode.
+ num_decoder_symbols: The number of classes to decode at each time step.
+ dtype: (default: `dtypes.int32`) The default data type to use when
+ handling integer objects.
+ name: (default: `None`) NameScope for the decoder function;
+ defaults to "simple_decoder_fn_inference"
+
+ Returns:
+ A decoder function with the required interface of `dynamic_rnn_decoder`
+ intended for inference.
+ """
+ with ops.name_scope(name, "simple_decoder_fn_inference",
+ [output_fn, encoder_state, embeddings,
+ start_of_sequence_id, end_of_sequence_id,
+ maximum_length, num_decoder_symbols, dtype]):
+ start_of_sequence_id = ops.convert_to_tensor(start_of_sequence_id, dtype)
+ end_of_sequence_id = ops.convert_to_tensor(end_of_sequence_id, dtype)
+ maximum_length = ops.convert_to_tensor(maximum_length, dtype)
+ num_decoder_symbols = ops.convert_to_tensor(num_decoder_symbols, dtype)
+ encoder_info = nest.flatten(encoder_state)[0]
+ batch_size = encoder_info.get_shape()[0].value
+ if output_fn is None:
+ output_fn = lambda x: x
+ if batch_size is None:
+ batch_size = array_ops.shape(encoder_info)[0]
+
+ def decoder_fn(time, cell_state, cell_input, cell_output, context_state):
+ """ Decoder function used in the `dynamic_rnn_decoder` with the purpose of
+ inference.
+
+ The main difference between this decoder function and the `decoder_fn` in
+ `simple_decoder_fn_train` is how `next_cell_input` is calculated. In this
+ decoder function we calculate the next input by applying an argmax across
+ the feature dimension of the output from the decoder. This is a
+ greedy-search approach. (Bahdanau et al., 2014) & (Sutskever et al., 2014)
+ use beam-search instead.
+
+ Args:
+ time: positive integer constant reflecting the current timestep.
+ cell_state: state of RNNCell.
+ cell_input: input provided by `dynamic_rnn_decoder`.
+ cell_output: output of RNNCell.
+ context_state: context state provided by `dynamic_rnn_decoder`.
+
+ Returns:
+ A tuple (done, next state, next input, emit output, next context state)
+ where:
+
+      done: A boolean vector to indicate which sentences have reached a
+      `end_of_sequence_id`. This is used for early stopping by the
+ `dynamic_rnn_decoder`. When `time>=maximum_length` a boolean vector with
+ all elements as `true` is returned.
+
+ next state: `cell_state`, this decoder function does not modify the
+ given state.
+
+ next input: The embedding from argmax of the `cell_output` is used as
+ `next_input`.
+
+ emit output: If `output_fn is None` the supplied `cell_output` is
+ returned, else the `output_fn` is used to update the `cell_output`
+ before calculating `next_input` and returning `cell_output`.
+
+ next context state: `context_state`, this decoder function does not
+ modify the given context state. The context state could be modified when
+ applying e.g. beam search.
+ """
+ with ops.name_scope(name, "simple_decoder_fn_inference",
+ [time, cell_state, cell_input, cell_output,
+ context_state]):
+ if cell_input is not None:
+ raise ValueError("Expected cell_input to be None, but saw: %s" %
+ cell_input)
+ if cell_output is None:
+ # invariant that this is time == 0
+ next_input_id = array_ops.ones([batch_size,], dtype=dtype) * (
+ start_of_sequence_id)
+ done = array_ops.zeros([batch_size,], dtype=dtypes.bool)
+ cell_state = encoder_state
+ cell_output = array_ops.zeros([num_decoder_symbols],
+ dtype=dtypes.float32)
+ else:
+ cell_output = output_fn(cell_output)
+ next_input_id = math_ops.cast(
+ math_ops.argmax(cell_output, 1), dtype=dtype)
+ done = math_ops.equal(next_input_id, end_of_sequence_id)
+ next_input = array_ops.gather(embeddings, next_input_id)
+ # if time > maxlen, return all true vector
+ done = control_flow_ops.cond(math_ops.greater(time, maximum_length),
+ lambda: array_ops.ones([batch_size,], dtype=dtypes.bool),
+ lambda: done)
+ return (done, cell_state, next_input, cell_output, context_state)
+ return decoder_fn
diff --git a/tensorflow/contrib/seq2seq/python/ops/seq2seq.py b/tensorflow/contrib/seq2seq/python/ops/seq2seq.py
new file mode 100644
index 0000000000..4e15d669cb
--- /dev/null
+++ b/tensorflow/contrib/seq2seq/python/ops/seq2seq.py
@@ -0,0 +1,208 @@
+# Copyright 2016 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+
+"""Seq2seq layer operations for use in neural networks.
+"""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from tensorflow.contrib import layers
+from tensorflow.python.framework import ops
+from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import control_flow_ops
+from tensorflow.python.ops import math_ops
+from tensorflow.python.ops import rnn
+from tensorflow.python.ops import tensor_array_ops
+from tensorflow.python.ops import variable_scope as vs
+
+__all__ = ["dynamic_rnn_decoder"]
+
+def dynamic_rnn_decoder(cell, decoder_fn, inputs=None, sequence_length=None,
+ parallel_iterations=None, swap_memory=False,
+ time_major=False, scope=None, name=None):
+ """ Dynamic RNN decoder for a sequence-to-sequence model specified by
+ RNNCell and decoder function.
+
+ The `dynamic_rnn_decoder` is similar to the `tf.python.ops.rnn.dynamic_rnn`
+ as the decoder does not make any assumptions of sequence length and batch
+ size of the input.
+
+ The `dynamic_rnn_decoder` has two modes: training or inference and expects
+  the user to create separate functions for each.
+
+ Under both training and inference `cell` and `decoder_fn` is expected. Where
+ the `cell` performs computation at every timestep using the `raw_rnn` and
+ the `decoder_fn` allows modelling of early stopping, output, state, and next
+ input and context.
+
+ When training the user is expected to supply `inputs`. At every time step a
+ slice of the supplied input is fed to the `decoder_fn`, which modifies and
+ returns the input for the next time step.
+
+ `sequence_length` is needed at training time, i.e., when `inputs` is not
+ None, for dynamic unrolling. At test time, when `inputs` is None,
+ `sequence_length` is not needed.
+
+ Under inference `inputs` is expected to be `None` and the input is inferred
+ solely from the `decoder_fn`.
+
+ Args:
+ cell: An instance of RNNCell.
+ decoder_fn: A function that takes time, cell state, cell input,
+ cell output and context state. It returns a early stopping vector,
+ cell state, next input, cell output and context state.
+      Examples of decoder_fn can be found in the decoder_fn.py file.
+ inputs: The inputs for decoding (embedded format).
+
+ If `time_major == False` (default), this must be a `Tensor` of shape:
+ `[batch_size, max_time, ...]`.
+
+ If `time_major == True`, this must be a `Tensor` of shape:
+ `[max_time, batch_size, ...]`.
+
+ The input to `cell` at each time step will be a `Tensor` with dimensions
+ `[batch_size, ...]`.
+ sequence_length: (optional) An int32/int64 vector sized `[batch_size]`.
+ if `inputs` is not None and `sequence_length` is None it is inferred
+ from the `inputs` as the maximal possible sequence length.
+ parallel_iterations: (Default: 32). The number of iterations to run in
+ parallel. Those operations which do not have any temporal dependency
+ and can be run in parallel, will be. This parameter trades off
+ time for space. Values >> 1 use more memory but take less time,
+ while smaller values use less memory but computations take longer.
+ swap_memory: Transparently swap the tensors produced in forward inference
+ but needed for back prop from GPU to CPU. This allows training RNNs
+ which would typically not fit on a single GPU, with very minimal (or no)
+ performance penalty.
+ time_major: The shape format of the `inputs` and `outputs` Tensors.
+ If true, these `Tensors` must be shaped `[max_time, batch_size, depth]`.
+ If false, these `Tensors` must be shaped `[batch_size, max_time, depth]`.
+ Using `time_major = True` is a bit more efficient because it avoids
+ transposes at the beginning and end of the RNN calculation. However,
+ most TensorFlow data is batch-major, so by default this function
+ accepts input and emits output in batch-major form.
+ scope: VariableScope for the `raw_rnn`;
+ defaults to None.
+ name: NameScope for the decoder;
+ defaults to "dynamic_rnn_decoder"
+
+ Returns:
+ A pair (outputs, state) where:
+
+ outputs: the RNN output 'Tensor'.
+
+ If time_major == False (default), this will be a `Tensor` shaped:
+ `[batch_size, max_time, cell.output_size]`.
+
+ If time_major == True, this will be a `Tensor` shaped:
+ `[max_time, batch_size, cell.output_size]`.
+
+ state: The final state and will be shaped
+ `[batch_size, cell.state_size]`.
+
+ Raises:
+ ValueError: if inputs is not None and has less than three dimensions.
+ """
+ with ops.name_scope(name, "dynamic_rnn_decoder",
+ [cell, decoder_fn, inputs, sequence_length,
+ parallel_iterations, swap_memory, time_major, scope]):
+ if inputs is not None:
+ # Convert to tensor
+ inputs = ops.convert_to_tensor(inputs)
+
+ # Test input dimensions
+ if inputs.get_shape().ndims is not None and (
+ inputs.get_shape().ndims < 2):
+ raise ValueError("Inputs must have at least two dimensions")
+ # Setup of RNN (dimensions, sizes, length, initial state, dtype)
+ if not time_major:
+ # [batch, seq, features] -> [seq, batch, features]
+ inputs = array_ops.transpose(inputs, perm=[1, 0, 2])
+
+ dtype = inputs.dtype
+ # Get data input information
+ input_depth = int(inputs.get_shape()[2])
+ batch_depth = inputs.get_shape()[1].value
+ max_time = inputs.get_shape()[0].value
+ if max_time is None:
+ max_time = array_ops.shape(inputs)[0]
+ # Setup decoder inputs as TensorArray
+ inputs_ta = tensor_array_ops.TensorArray(dtype, size=max_time)
+ inputs_ta = inputs_ta.unpack(inputs)
+
+ def loop_fn(time, cell_output, cell_state, loop_state):
+ if cell_state is None: # first call, before while loop (in raw_rnn)
+ if cell_output is not None:
+ raise ValueError("Expected cell_output to be None when cell_state "
+ "is None, but saw: %s" % cell_output)
+ if loop_state is not None:
+ raise ValueError("Expected loop_state to be None when cell_state "
+ "is None, but saw: %s" % loop_state)
+ context_state = None
+    else: # subsequent calls, inside while loop, after cell execution
+ if isinstance(loop_state, tuple):
+ (done, context_state) = loop_state
+ else:
+ done = loop_state
+ context_state = None
+
+ # call decoder function
+ if inputs is not None: # training
+ # get next_cell_input
+ if cell_state is None:
+ next_cell_input = inputs_ta.read(0)
+ else:
+ if batch_depth is not None:
+ batch_size = batch_depth
+ else:
+ batch_size = array_ops.shape(done)[0]
+ next_cell_input = control_flow_ops.cond(
+ math_ops.equal(time, max_time),
+ lambda: array_ops.zeros([batch_size, input_depth], dtype=dtype),
+ lambda: inputs_ta.read(time))
+ (next_done, next_cell_state, next_cell_input, emit_output,
+ next_context_state) = decoder_fn(time, cell_state, next_cell_input,
+ cell_output, context_state)
+ else: # inference
+ # next_cell_input is obtained through decoder_fn
+ (next_done, next_cell_state, next_cell_input, emit_output,
+ next_context_state) = decoder_fn(time, cell_state, None, cell_output,
+ context_state)
+
+ # check if we are done
+ if next_done is None: # training
+ next_done = time >= sequence_length
+
+ # build next_loop_state
+ if next_context_state is None:
+ next_loop_state = next_done
+ else:
+ next_loop_state = (next_done, next_context_state)
+
+ return (next_done, next_cell_input, next_cell_state,
+ emit_output, next_loop_state)
+
+ # Run raw_rnn function
+ outputs_ta, state, _ = rnn.raw_rnn(
+ cell, loop_fn, parallel_iterations=parallel_iterations,
+ swap_memory=swap_memory, scope=scope)
+ outputs = outputs_ta.pack()
+
+ if not time_major:
+ # [seq, batch, features] -> [batch, seq, features]
+ outputs = array_ops.transpose(outputs, perm=[1, 0, 2])
+ return outputs, state
diff --git a/tensorflow/contrib/session_bundle/session_bundle.cc b/tensorflow/contrib/session_bundle/session_bundle.cc
index 37aadfbc8e..bc6fdcd4de 100644
--- a/tensorflow/contrib/session_bundle/session_bundle.cc
+++ b/tensorflow/contrib/session_bundle/session_bundle.cc
@@ -48,7 +48,7 @@ auto* load_attempt_count = monitoring::Counter<2>::New(
"model_path", "status");
auto* load_latency = monitoring::Counter<1>::New(
"/tensorflow/contrib/session_bundle/load_latency",
- "Latency in microseconds for SessionBundles that were succesfully loaded.",
+ "Latency in microseconds for SessionBundles that were successfully loaded.",
"model_path");
constexpr char kLoadAttemptFail[] = "fail";
constexpr char kLoadAttemptSuccess[] = "success";
diff --git a/tensorflow/contrib/slim/README.md b/tensorflow/contrib/slim/README.md
index 0f31f6f346..e6100ef675 100644
--- a/tensorflow/contrib/slim/README.md
+++ b/tensorflow/contrib/slim/README.md
@@ -109,7 +109,7 @@ weights = variables.variable('weights',
Note that in native TensorFlow, there are two types of variables: regular
variables and local (transient) variables. The vast majority of variables are
regular variables: once created, they can be saved to disk using a
-[saver](https://www.tensorflow.org/versions/r0.9/api_docs/python/state_ops.html#Saver).
+[saver](https://www.tensorflow.org/versions/r0.11/api_docs/python/state_ops.html#Saver).
Local variables are those variables that only exist for the duration of a
session and are not saved to disk.
@@ -215,7 +215,7 @@ Dropout| [slim.dropout](https://www.tensorflow.org/code/tensorflow/contrib/layer
Flatten | [slim.flatten](https://www.tensorflow.org/code/tensorflow/contrib/layers/python/layers/layers.py)
MaxPool2D | [slim.max_pool2d](https://www.tensorflow.org/code/tensorflow/contrib/layers/python/layers/layers.py)
OneHotEncoding | [slim.one_hot_encoding](https://www.tensorflow.org/code/tensorflow/contrib/layers/python/layers/layers.py)
-SeperableConv2 | [slim.seperable_conv2d](https://www.tensorflow.org/code/tensorflow/contrib/layers/python/layers/layers.py)
+SeparableConv2 | [slim.separable_conv2d](https://www.tensorflow.org/code/tensorflow/contrib/layers/python/layers/layers.py)
UnitNorm | [slim.unit_norm](https://www.tensorflow.org/code/tensorflow/contrib/layers/python/layers/layers.py)
TF-Slim also provides two meta-operations called `repeat` and `stack` that
@@ -901,7 +901,7 @@ slim.evaluation.evaluation_loop(
log_dir,
num_evals=num_batches,
eval_op=names_to_updates.values(),
- summary_op=tf.merge_summary(summary_ops),
+ summary_op=tf.summary.merge(summary_ops),
eval_interval_secs=eval_interval_secs)
```
diff --git a/tensorflow/contrib/slim/python/slim/evaluation.py b/tensorflow/contrib/slim/python/slim/evaluation.py
index b66b33ed2d..b89eca46ea 100644
--- a/tensorflow/contrib/slim/python/slim/evaluation.py
+++ b/tensorflow/contrib/slim/python/slim/evaluation.py
@@ -283,5 +283,3 @@ def evaluation_loop(master,
config=session_config,
max_number_of_evaluations=max_number_of_evaluations,
timeout=timeout)
-
-
diff --git a/tensorflow/contrib/slim/python/slim/learning_test.py b/tensorflow/contrib/slim/python/slim/learning_test.py
index 42949e2c28..7225ab86c4 100644
--- a/tensorflow/contrib/slim/python/slim/learning_test.py
+++ b/tensorflow/contrib/slim/python/slim/learning_test.py
@@ -625,7 +625,7 @@ class TrainTest(tf.test.TestCase):
tf.set_random_seed(2)
train_op = self.create_train_op()
- model_variables = tf.all_variables()
+ model_variables = tf.global_variables()
model_path = os.path.join(logdir1, 'model.ckpt-300')
init_op = tf.global_variables_initializer()
@@ -674,7 +674,7 @@ class TrainTest(tf.test.TestCase):
tf.set_random_seed(2)
train_op = self.create_train_op()
- model_variables = tf.all_variables()
+ model_variables = tf.global_variables()
model_path = os.path.join(logdir1, 'model.ckpt-300')
saver = tf.train.Saver(model_variables)
def RestoreFn(sess):
diff --git a/tensorflow/contrib/slim/python/slim/model_analyzer.py b/tensorflow/contrib/slim/python/slim/model_analyzer.py
index e29c7b1d8c..74617928a7 100644
--- a/tensorflow/contrib/slim/python/slim/model_analyzer.py
+++ b/tensorflow/contrib/slim/python/slim/model_analyzer.py
@@ -84,7 +84,7 @@ def analyze_vars(variables, print_info=False):
"""Prints the names and shapes of the variables.
Args:
- variables: list of variables, for example tf.all_variables().
+ variables: list of variables, for example tf.global_variables().
print_info: Optional, if true print variables and their shape.
Returns:
diff --git a/tensorflow/contrib/specs/python/specs_test.py b/tensorflow/contrib/specs/python/specs_test.py
index d0e650bc5f..67e4a559a9 100644
--- a/tensorflow/contrib/specs/python/specs_test.py
+++ b/tensorflow/contrib/specs/python/specs_test.py
@@ -197,7 +197,7 @@ class SpecsTest(tf.test.TestCase):
initializer=tf.constant_initializer(42.0))
inputs = tf.constant(_rand(10, 100))
outputs = v.funcall(inputs)
- self.assertEqual(len(tf.all_variables()), 1)
+ self.assertEqual(len(tf.global_variables()), 1)
sess.run([outputs.initializer])
outputs_value = outputs.eval()
self.assertEqual(outputs_value.shape, (2, 2))
@@ -211,7 +211,7 @@ class SpecsTest(tf.test.TestCase):
g = f | f | f | f
inputs = tf.constant(_rand(10, 100))
_ = g.funcall(inputs)
- self.assertEqual(len(tf.all_variables()), 2)
+ self.assertEqual(len(tf.global_variables()), 2)
def testAutoFunction(self):
with self.test_session():
diff --git a/tensorflow/contrib/stat_summarizer/python/stat_summarizer_test.py b/tensorflow/contrib/stat_summarizer/python/stat_summarizer_test.py
index 616be81e27..a84e11b043 100644
--- a/tensorflow/contrib/stat_summarizer/python/stat_summarizer_test.py
+++ b/tensorflow/contrib/stat_summarizer/python/stat_summarizer_test.py
@@ -34,7 +34,7 @@ class StatSummarizerTest(tf.test.TestCase):
graph_def.SerializeToString())
with self.test_session() as sess:
- sess.run(tf.initialize_all_variables())
+ sess.run(tf.global_variables_initializer())
for _ in range(20):
run_metadata = tf.RunMetadata()
diff --git a/tensorflow/contrib/tensorboard/plugins/projector/projector_api_test.py b/tensorflow/contrib/tensorboard/plugins/projector/projector_api_test.py
index 27093440f8..6bb310db3e 100644
--- a/tensorflow/contrib/tensorboard/plugins/projector/projector_api_test.py
+++ b/tensorflow/contrib/tensorboard/plugins/projector/projector_api_test.py
@@ -31,7 +31,7 @@ class ProjectorApiTest(tf.test.TestCase):
# Create a dummy configuration.
config = tf.contrib.tensorboard.plugins.projector.ProjectorConfig()
config.model_checkpoint_path = 'test'
- emb1 = config.embedding.add()
+ emb1 = config.embeddings.add()
emb1.tensor_name = 'tensor1'
emb1.metadata_path = 'metadata1'
@@ -47,3 +47,7 @@ class ProjectorApiTest(tf.test.TestCase):
config2 = tf.contrib.tensorboard.plugins.projector.ProjectorConfig()
text_format.Parse(f.read(), config2)
self.assertEqual(config, config2)
+
+
+if __name__ == "__main__":
+ tf.test.main()
diff --git a/tensorflow/contrib/training/python/training/device_setter.py b/tensorflow/contrib/training/python/training/device_setter.py
index ae6ffb8f28..53f6fa5436 100644
--- a/tensorflow/contrib/training/python/training/device_setter.py
+++ b/tensorflow/contrib/training/python/training/device_setter.py
@@ -42,7 +42,7 @@ class GreedyLoadBalancingStrategy(object):
off CPU-intensive ops with RAM-intensive ops with network bandwidth.
This class is intended to be used as a `ps_strategy` in
- `tf.replica_device_setter`.
+ `tf.train.replica_device_setter`.
"""
def __init__(self, num_tasks, load_fn):
diff --git a/tensorflow/contrib/training/python/training/evaluation_test.py b/tensorflow/contrib/training/python/training/evaluation_test.py
index 927f6ab75a..3d83aec94e 100644
--- a/tensorflow/contrib/training/python/training/evaluation_test.py
+++ b/tensorflow/contrib/training/python/training/evaluation_test.py
@@ -51,7 +51,7 @@ class CheckpointIteratorTest(tf.test.TestCase):
saver = tf.train.Saver() # Saves the global step.
with self.test_session() as session:
- session.run(tf.initialize_all_variables())
+ session.run(tf.global_variables_initializer())
save_path = os.path.join(checkpoint_dir, 'model.ckpt')
saver.save(session, save_path, global_step=global_step)
@@ -81,7 +81,7 @@ class CheckpointIteratorTest(tf.test.TestCase):
target='',
config=tf.ConfigProto(device_count={'CPU': 2})) as session:
- session.run(tf.initialize_all_variables())
+ session.run(tf.global_variables_initializer())
save_path = os.path.join(checkpoint_dir, 'model.ckpt')
saver.save(session, save_path, global_step=global_step)
diff --git a/tensorflow/contrib/training/python/training/training_test.py b/tensorflow/contrib/training/python/training/training_test.py
index 918c1da018..c0e79aa798 100644
--- a/tensorflow/contrib/training/python/training/training_test.py
+++ b/tensorflow/contrib/training/python/training/training_test.py
@@ -310,7 +310,7 @@ class TrainTest(tf.test.TestCase):
tf.set_random_seed(2)
train_op = self.create_train_op()
- model_variables = tf.all_variables()
+ model_variables = tf.global_variables()
model_path = os.path.join(logdir1, 'model.ckpt-300')
assign_fn = tf.contrib.framework.assign_from_checkpoint_fn(
diff --git a/tensorflow/core/BUILD b/tensorflow/core/BUILD
index fe40c691c5..991fc2f29d 100644
--- a/tensorflow/core/BUILD
+++ b/tensorflow/core/BUILD
@@ -949,12 +949,12 @@ cc_library(
# Libraries with GPU facilities that are useful for writing kernels.
cc_library(
name = "gpu_lib",
- srcs = if_not_windows([
+ srcs = [
"common_runtime/gpu/gpu_event_mgr.cc",
- ]),
- hdrs = if_not_windows([
+ ],
+ hdrs = [
"common_runtime/gpu/gpu_event_mgr.h",
- ]),
+ ],
copts = tf_copts(),
visibility = ["//visibility:public"],
deps = [
@@ -964,7 +964,8 @@ cc_library(
":lib_internal",
":proto_text",
":protos_all_cc",
- ] + if_not_windows([":stream_executor"]),
+ ":stream_executor",
+ ],
)
cc_library(
@@ -982,7 +983,7 @@ tf_proto_library_cc(
name = "worker_proto",
srcs = ["protobuf/worker.proto"],
cc_api_version = 2,
- cc_libs = [":protos_all_cc"],
+ protodeps = [":protos_all"],
visibility = [
"//tensorflow:internal",
],
@@ -993,8 +994,8 @@ tf_proto_library_cc(
srcs = ["protobuf/worker_service.proto"],
has_services = 1,
cc_api_version = 2,
- cc_libs = [":worker_proto_cc"],
cc_stubby_versions = ["2"],
+ protodeps = [":worker_proto"],
visibility = [
"//tensorflow:internal",
],
@@ -1004,7 +1005,7 @@ tf_proto_library_cc(
name = "master_proto",
srcs = ["protobuf/master.proto"],
cc_api_version = 2,
- cc_libs = [":protos_all_cc"],
+ protodeps = [":protos_all"],
visibility = [
"//tensorflow:internal",
],
@@ -1015,8 +1016,8 @@ tf_proto_library_cc(
srcs = ["protobuf/master_service.proto"],
has_services = 1,
cc_api_version = 2,
- cc_libs = [":master_proto_cc"],
cc_stubby_versions = ["2"],
+ protodeps = [":master_proto"],
visibility = [
"//tensorflow:internal",
],
@@ -1417,7 +1418,7 @@ tf_cuda_library(
tf_cuda_library(
name = "gpu_runtime",
- srcs = if_not_windows([
+ srcs = [
"common_runtime/gpu/gpu_bfc_allocator.cc",
"common_runtime/gpu/gpu_debug_allocator.cc",
"common_runtime/gpu/gpu_device.cc",
@@ -1429,8 +1430,8 @@ tf_cuda_library(
"common_runtime/gpu/pool_allocator.cc",
"common_runtime/gpu/process_state.cc",
"common_runtime/gpu_device_context.h",
- ]),
- hdrs = if_not_windows([
+ ],
+ hdrs = [
"common_runtime/gpu/gpu_bfc_allocator.h",
"common_runtime/gpu/gpu_debug_allocator.h",
"common_runtime/gpu/gpu_device.h",
@@ -1439,7 +1440,7 @@ tf_cuda_library(
"common_runtime/gpu/gpu_util.h",
"common_runtime/gpu/pool_allocator.h",
"common_runtime/gpu/process_state.h",
- ]),
+ ],
copts = tf_copts(),
linkstatic = 1,
deps = [
@@ -1451,10 +1452,9 @@ tf_cuda_library(
":lib",
":lib_internal",
":protos_all_cc",
- "//third_party/eigen3",
- ] + if_not_windows([
":stream_executor",
- ]),
+ "//third_party/eigen3",
+ ],
alwayslink = 1,
)
diff --git a/tensorflow/core/common_runtime/direct_session_test.cc b/tensorflow/core/common_runtime/direct_session_test.cc
index 44f17d6260..4b0165bae7 100644
--- a/tensorflow/core/common_runtime/direct_session_test.cc
+++ b/tensorflow/core/common_runtime/direct_session_test.cc
@@ -835,7 +835,7 @@ static void TestSessionInterOpThreadsImpl(bool use_function_lib) {
FunctionLibraryDefinition flib(OpRegistry::Global(), library_graph_def);
Graph g(&flib);
Tensor t(DT_FLOAT, TensorShape({}));
- t.scalar<float>()() = {1.2};
+ t.scalar<float>()() = {1.2f};
Node* x = test::graph::Constant(&g, t);
Node* y;
if (use_function_lib) {
@@ -945,7 +945,7 @@ TEST(DirectSessionTest, TestSessionInterOpThreadsWithFunctions) {
TEST(DirectSessionTest, TestSessionInterOpThreadsInvalidOptions) {
Graph g(OpRegistry::Global());
Tensor t(DT_FLOAT, TensorShape({}));
- t.scalar<float>()() = {1.2};
+ t.scalar<float>()() = {1.2f};
Node* x = test::graph::Constant(&g, t);
GraphDef def;
test::graph::ToGraphDef(&g, &def);
@@ -979,7 +979,7 @@ TEST(DirectSessionTest, TestDirectSessionRunClose) {
// Construct a graph with a variable and a single assign.
Graph g(OpRegistry::Global());
Tensor t(DT_FLOAT, TensorShape({}));
- t.scalar<float>()() = {1.2};
+ t.scalar<float>()() = {1.2f};
Node* var_val = test::graph::Constant(&g, t);
Node* var = test::graph::Var(&g, DT_FLOAT, {});
Node* var_assign = test::graph::Assign(&g, var, var_val);
@@ -1063,7 +1063,7 @@ TEST(DirectSessionTest, TestDirectSessionReset) {
// Construct a graph with a variable and a single assign.
Graph g(OpRegistry::Global());
Tensor t(DT_FLOAT, TensorShape({}));
- t.scalar<float>()() = {1.2};
+ t.scalar<float>()() = {1.2f};
Node* var_val = test::graph::Constant(&g, t);
Node* var = test::graph::Var(&g, DT_FLOAT, {});
Node* var_assign = test::graph::Assign(&g, var, var_val);
diff --git a/tensorflow/core/common_runtime/executor.h b/tensorflow/core/common_runtime/executor.h
index 8cca22fb6f..239c9666e3 100644
--- a/tensorflow/core/common_runtime/executor.h
+++ b/tensorflow/core/common_runtime/executor.h
@@ -39,7 +39,7 @@ class StepStatsCollector;
// Rendezvous* rendezvous = NewNaiveRendezvous();
// TF_CHECK_OK(rendezvous->Send("input", some_input_tensor));
// TF_CHECK_OK(executor->Run({ExecutorOpts, rendezvous, nullptr}));
-// TF_CHECK_OK(rendezvous->Recv("input", &output_tensor));
+// TF_CHECK_OK(rendezvous->Recv("output", &output_tensor));
// ... ...
//
// Multiple threads can call Executor::Run concurrently.
diff --git a/tensorflow/core/common_runtime/sycl/sycl_allocator.cc b/tensorflow/core/common_runtime/sycl/sycl_allocator.cc
index 175b784825..699b54f345 100644
--- a/tensorflow/core/common_runtime/sycl/sycl_allocator.cc
+++ b/tensorflow/core/common_runtime/sycl/sycl_allocator.cc
@@ -19,16 +19,26 @@ limitations under the License.
namespace tensorflow {
-SYCLAllocator::~SYCLAllocator() { }
+SYCLAllocator::~SYCLAllocator() {}
string SYCLAllocator::Name() { return "device:SYCL"; }
void *SYCLAllocator::AllocateRaw(size_t alignment, size_t num_bytes) {
+ assert(device_);
auto p = device_->allocate(num_bytes);
return p;
}
-void SYCLAllocator::DeallocateRaw(void *ptr) { device_->deallocate(ptr); }
+void SYCLAllocator::DeallocateRaw(void *ptr) {
+ if (device_) {
+ device_->deallocate(ptr);
+ }
+}
+
+void SYCLAllocator::EnterLameDuckMode() {
+ device_->deallocate_all();
+ device_ = nullptr;
+}
} // namespace tensorflow
diff --git a/tensorflow/core/common_runtime/sycl/sycl_allocator.h b/tensorflow/core/common_runtime/sycl/sycl_allocator.h
index 887c727f6e..8558b6c873 100644
--- a/tensorflow/core/common_runtime/sycl/sycl_allocator.h
+++ b/tensorflow/core/common_runtime/sycl/sycl_allocator.h
@@ -29,14 +29,16 @@ namespace tensorflow {
class SYCLAllocator : public Allocator {
public:
- SYCLAllocator(Eigen::SyclDevice* device) : device_(device) {}
+ SYCLAllocator(Eigen::QueueInterface* device) : device_(device) {}
virtual ~SYCLAllocator() override;
string Name() override;
void *AllocateRaw(size_t alignment, size_t num_bytes) override;
void DeallocateRaw(void *ptr) override;
+ void EnterLameDuckMode();
+ virtual bool ShouldAllocateEmptyTensors() override final { return true; }
private:
- Eigen::SyclDevice *device_; // not owned
+ Eigen::QueueInterface *device_; // not owned
TF_DISALLOW_COPY_AND_ASSIGN(SYCLAllocator);
};
diff --git a/tensorflow/core/common_runtime/sycl/sycl_device.cc b/tensorflow/core/common_runtime/sycl/sycl_device.cc
index 10a037c02d..e5fe85bcf5 100644
--- a/tensorflow/core/common_runtime/sycl/sycl_device.cc
+++ b/tensorflow/core/common_runtime/sycl/sycl_device.cc
@@ -25,8 +25,9 @@ namespace tensorflow {
SYCLDevice::~SYCLDevice() {
device_context_->Unref();
- delete sycl_allocator_;
+ sycl_allocator_->EnterLameDuckMode();
delete sycl_device_;
+ delete sycl_queue_;
}
void SYCLDevice::Compute(OpKernel *op_kernel, OpKernelContext *context) {
@@ -50,12 +51,8 @@ Allocator *SYCLDevice::GetAllocator(AllocatorAttributes attr) {
Status SYCLDevice::MakeTensorFromProto(const TensorProto &tensor_proto,
const AllocatorAttributes alloc_attrs,
Tensor *tensor) {
- AllocatorAttributes attr;
- attr.set_on_host(true);
- attr.set_gpu_compatible(true);
- Allocator *host_alloc = GetAllocator(attr);
Tensor parsed(tensor_proto.dtype());
- if (!parsed.FromProto(host_alloc, tensor_proto)) {
+ if (!parsed.FromProto(cpu_allocator_, tensor_proto)) {
return errors::InvalidArgument("Cannot parse tensor from proto: ",
tensor_proto.DebugString());
}
@@ -86,6 +83,12 @@ Status SYCLDevice::FillContextMap(const Graph *graph,
return Status::OK();
}
+Status SYCLDevice::Sync() {
+ sycl_device_->synchronize();
+ return Status::OK();
+}
+
+
} // namespace tensorflow
#endif // TENSORFLOW_USE_SYCL
diff --git a/tensorflow/core/common_runtime/sycl/sycl_device.h b/tensorflow/core/common_runtime/sycl/sycl_device.h
index d3b3db2a71..2759053df5 100644
--- a/tensorflow/core/common_runtime/sycl/sycl_device.h
+++ b/tensorflow/core/common_runtime/sycl/sycl_device.h
@@ -22,7 +22,6 @@ limitations under the License.
#define EIGEN_USE_SYCL
-#include "tensorflow/core/common_runtime/device_factory.h"
#include "tensorflow/core/common_runtime/local_device.h"
#include "tensorflow/core/common_runtime/sycl/sycl_allocator.h"
#include "tensorflow/core/common_runtime/sycl/sycl_device_context.h"
@@ -30,7 +29,6 @@ limitations under the License.
namespace tensorflow {
-
class SYCLDevice : public LocalDevice {
public:
template <typename SYCLSelector>
@@ -42,8 +40,9 @@ public:
name, DEVICE_SYCL, memory_limit, locality,
physical_device_desc), nullptr),
cpu_allocator_(cpu_allocator),
- sycl_device_(new Eigen::SyclDevice(sycl_selector)),
- sycl_allocator_(new SYCLAllocator(sycl_device_)),
+ sycl_queue_(new Eigen::QueueInterface(sycl_selector)),
+ sycl_device_(new Eigen::SyclDevice(sycl_queue_)),
+ sycl_allocator_(new SYCLAllocator(sycl_queue_)),
device_context_(new SYCLDeviceContext()) {
set_eigen_sycl_device(sycl_device_);
}
@@ -59,16 +58,17 @@ public:
Status FillContextMap(const Graph *graph,
DeviceContextMap *device_context_map) override;
- Status Sync() override { return Status::OK(); }
+ Status Sync() override;
static string GetShortDeviceDescription(/*int device_id,
const DeviceDescription& desc*/) {
return strings::StrCat("device: 0, name SYCL, pci bus id: 0");
}
private:
- Allocator *cpu_allocator_; // owned
- Eigen::SyclDevice* sycl_device_; // owned
- SYCLAllocator *sycl_allocator_; // owned
+ Allocator *cpu_allocator_; // owned
+ Eigen::QueueInterface* sycl_queue_; // owned
+ Eigen::SyclDevice* sycl_device_; // owned
+ SYCLAllocator *sycl_allocator_; // owned
SYCLDeviceContext *device_context_;
};
diff --git a/tensorflow/core/common_runtime/sycl/sycl_device_context.cc b/tensorflow/core/common_runtime/sycl/sycl_device_context.cc
index 9dd289bebd..b487d24c20 100644
--- a/tensorflow/core/common_runtime/sycl/sycl_device_context.cc
+++ b/tensorflow/core/common_runtime/sycl/sycl_device_context.cc
@@ -16,13 +16,11 @@ limitations under the License.
#if TENSORFLOW_USE_SYCL
#define EIGEN_USE_SYCL
+#include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
#include "tensorflow/core/common_runtime/sycl/sycl_device_context.h"
#include "tensorflow/core/common_runtime/dma_helper.h"
-#define EIGEN_USE_SYCL
-#include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
-
namespace tensorflow {
void SYCLDeviceContext::CopyCPUTensorToDevice(const Tensor *cpu_tensor,
@@ -108,7 +106,6 @@ void SYCLDeviceContext::CopyDeviceTensorToCPU(const Tensor *device_tensor,
StatusCallback done) {
const int64 total_bytes = device_tensor->TotalBytes();
if (total_bytes > 0) {
- device->eigen_sycl_device()->deallocate_all();
const void* src_ptr = DMAHelper::base(device_tensor);
void* dst_ptr = DMAHelper::base(cpu_tensor);
switch (device_tensor->dtype()) {
diff --git a/tensorflow/core/common_runtime/sycl/sycl_device_factory.cc b/tensorflow/core/common_runtime/sycl/sycl_device_factory.cc
index 9b8770420c..cf9e349e01 100644
--- a/tensorflow/core/common_runtime/sycl/sycl_device_factory.cc
+++ b/tensorflow/core/common_runtime/sycl/sycl_device_factory.cc
@@ -15,6 +15,7 @@ limitations under the License.
#if TENSORFLOW_USE_SYCL
+#include "tensorflow/core/common_runtime/device_factory.h"
#include "tensorflow/core/common_runtime/sycl/sycl_device.h"
namespace tensorflow {
diff --git a/tensorflow/core/debug/BUILD b/tensorflow/core/debug/BUILD
index 2363b69390..3e4ab5bc17 100644
--- a/tensorflow/core/debug/BUILD
+++ b/tensorflow/core/debug/BUILD
@@ -36,7 +36,7 @@ tf_proto_library_cc(
has_services = 1,
cc_api_version = 2,
cc_grpc_version = 1,
- cc_libs = ["//tensorflow/core:protos_all_cc"],
+ protodeps = ["//tensorflow/core:protos_all"],
)
# Depending on this target causes a concrete DebuggerState implementation
diff --git a/tensorflow/core/debug/debug_gateway_test.cc b/tensorflow/core/debug/debug_gateway_test.cc
index 1f6e766663..963cea8419 100644
--- a/tensorflow/core/debug/debug_gateway_test.cc
+++ b/tensorflow/core/debug/debug_gateway_test.cc
@@ -372,9 +372,9 @@ TEST_F(SessionDebugMinusAXTest,
debug_gateway.SetNodeValueCallback(
[this, &mu, &val_callback_count, &a_debug_identity_node_name,
&x_debug_identity_node_name, &y_debug_identity_node_name,
- &debug_identity_tensor_vals,
- &callbacks_done](const string& node_name, const int output_slot,
- const Tensor& tensor_value, const bool is_ref) {
+ &debug_identity_tensor_vals, &callbacks_done, &kConcurrentRuns](
+ const string& node_name, const int output_slot,
+ const Tensor& tensor_value, const bool is_ref) {
mutex_lock l(mu);
if (node_name == a_debug_identity_node_name && output_slot == 0) {
diff --git a/tensorflow/core/debug/debug_io_utils.cc b/tensorflow/core/debug/debug_io_utils.cc
index 41868ce8da..4b5ecaa9b6 100644
--- a/tensorflow/core/debug/debug_io_utils.cc
+++ b/tensorflow/core/debug/debug_io_utils.cc
@@ -18,6 +18,12 @@ limitations under the License.
#include <vector>
#include "grpc++/create_channel.h"
+
+#if defined(PLATFORM_WINDOWS)
+// winsock2.h is used in grpc, so Ws2_32.lib is needed
+#pragma comment(lib,"Ws2_32.lib")
+#endif
+
#include "tensorflow/core/debug/debug_service.grpc.pb.h"
#include "tensorflow/core/framework/summary.pb.h"
#include "tensorflow/core/lib/io/path.h"
diff --git a/tensorflow/core/debug/debug_io_utils_test.cc b/tensorflow/core/debug/debug_io_utils_test.cc
index 1ddab1689b..ab020517b0 100644
--- a/tensorflow/core/debug/debug_io_utils_test.cc
+++ b/tensorflow/core/debug/debug_io_utils_test.cc
@@ -273,7 +273,8 @@ TEST_F(DebugIOUtilsTest, PublishTensorConcurrentlyToPartiallyOverlappingPaths) {
auto fn = [this, &dump_count, &done_count, &mu, &dump_root_base, &dump_roots,
&dump_file_paths, &wall_time, &tensor_name, &debug_node_name,
- &kNodeName, &kDebugOpName, &kConcurrentPubs, &all_done]() {
+ &kNodeName, &kDebugOpName, &kConcurrentPubs, &kOutputSlot,
+ &all_done]() {
// "gumpy" is the shared directory part of the path.
string dump_root;
string debug_url;
diff --git a/tensorflow/core/framework/partial_tensor_shape_test.cc b/tensorflow/core/framework/partial_tensor_shape_test.cc
index b008a93c03..23f3d908fb 100644
--- a/tensorflow/core/framework/partial_tensor_shape_test.cc
+++ b/tensorflow/core/framework/partial_tensor_shape_test.cc
@@ -220,7 +220,7 @@ TEST(PartialTensorShapeTest, PartialShapeMergeWith) {
TEST(PartialTensorShapeTest, MakePartialShapeEmpty) {
// Empty made partial shapes should still be fully defined
- const int64 dims[0] = {};
+ const int64 dims[1] = {};
PartialTensorShape shape;
EXPECT_FALSE(shape.IsFullyDefined());
TF_ASSERT_OK(PartialTensorShape::MakePartialShape(dims, 0, &shape));
diff --git a/tensorflow/core/framework/tensor_testutil.h b/tensorflow/core/framework/tensor_testutil.h
index 73afca40ac..29b9de5c07 100644
--- a/tensorflow/core/framework/tensor_testutil.h
+++ b/tensorflow/core/framework/tensor_testutil.h
@@ -16,6 +16,8 @@ limitations under the License.
#ifndef TENSORFLOW_FRAMEWORK_TENSOR_TESTUTIL_H_
#define TENSORFLOW_FRAMEWORK_TENSOR_TESTUTIL_H_
+#include <numeric>
+
#include "tensorflow/core/framework/tensor.h"
#include "tensorflow/core/lib/gtl/array_slice.h"
#include "tensorflow/core/platform/logging.h"
diff --git a/tensorflow/core/kernels/BUILD b/tensorflow/core/kernels/BUILD
index 2315c2ffb6..e99ed9dfa8 100644
--- a/tensorflow/core/kernels/BUILD
+++ b/tensorflow/core/kernels/BUILD
@@ -2342,7 +2342,6 @@ cc_library(
":batch_norm_op",
":bias_op",
":conv_ops",
- ":depthwise_conv_grad_op",
":dilation_ops",
":fused_batch_norm_op",
":in_topk_op",
@@ -2354,7 +2353,10 @@ cc_library(
":softsign_op",
":topk_op",
":xent_op",
- ] + if_not_windows([":depthwise_conv_op"]),
+ ] + if_not_windows([
+ ":depthwise_conv_grad_op",
+ ":depthwise_conv_op",
+ ]),
)
NN_DEPS = [
diff --git a/tensorflow/core/kernels/adjust_contrast_op_test.cc b/tensorflow/core/kernels/adjust_contrast_op_test.cc
index b925dc6883..06fd7ca419 100644
--- a/tensorflow/core/kernels/adjust_contrast_op_test.cc
+++ b/tensorflow/core/kernels/adjust_contrast_op_test.cc
@@ -56,7 +56,7 @@ TEST_F(AdjustContrastOpTest, Simple_1223) {
TF_EXPECT_OK(InitOp());
AddInputFromArray<float>(TensorShape({1, 2, 2, 3}),
{1, 5, 9, 2, 6, 10, 3, 7, 11, 4, 8, 12});
- AddInputFromArray<float>(TensorShape({}), {0.2});
+ AddInputFromArray<float>(TensorShape({}), {0.2f});
TF_ASSERT_OK(RunOpKernel());
Tensor expected(allocator(), DT_FLOAT, TensorShape({1, 2, 2, 3}));
@@ -78,7 +78,7 @@ TEST_F(AdjustContrastOpTest, Big_99x99x3) {
}
AddInputFromArray<float>(TensorShape({1, 99, 99, 3}), values);
- AddInputFromArray<float>(TensorShape({}), {0.2});
+ AddInputFromArray<float>(TensorShape({}), {0.2f});
TF_ASSERT_OK(RunOpKernel());
}
diff --git a/tensorflow/core/kernels/batch_norm_op_test.cc b/tensorflow/core/kernels/batch_norm_op_test.cc
index 746b0d46ad..c5e55346eb 100644
--- a/tensorflow/core/kernels/batch_norm_op_test.cc
+++ b/tensorflow/core/kernels/batch_norm_op_test.cc
@@ -47,15 +47,15 @@ TEST_F(BatchNormOpTest, Simple) {
AddInputFromArray<float>(TensorShape({1, 1, 6, 2}),
{1, 4, 2, 5, 3, 6, -1, -4, -2, -5, -3, -6});
AddInputFromArray<float>(TensorShape({2}), {10, 20});
- AddInputFromArray<float>(TensorShape({2}), {0.25, 0.5});
- AddInputFromArray<float>(TensorShape({2}), {0.1, 0.6});
- AddInputFromArray<float>(TensorShape({2}), {0.0, 0.0});
+ AddInputFromArray<float>(TensorShape({2}), {0.25f, 0.5f});
+ AddInputFromArray<float>(TensorShape({2}), {0.1f, 0.6f});
+ AddInputFromArray<float>(TensorShape({2}), {0.0f, 0.0f});
TF_ASSERT_OK(RunOpKernel());
Tensor expected(allocator(), DT_FLOAT, TensorShape({1, 1, 6, 2}));
test::FillValues<float>(
- &expected, {-17.86, -22.00, -15.87, -20.59, -13.87, -19.18, -21.86,
- -33.31, -23.85, -34.72, -25.85, -36.13});
+ &expected, {-17.86f, -22.00f, -15.87f, -20.59f, -13.87f, -19.18f, -21.86f,
+ -33.31f, -23.85f, -34.72f, -25.85f, -36.13f });
test::ExpectTensorNear<float>(expected, *GetOutput(0), 0.01);
}
diff --git a/tensorflow/core/kernels/cast_op_test.cc b/tensorflow/core/kernels/cast_op_test.cc
index ffad7fd02e..5b7529bb8a 100644
--- a/tensorflow/core/kernels/cast_op_test.cc
+++ b/tensorflow/core/kernels/cast_op_test.cc
@@ -49,17 +49,18 @@ class CastOpTest : public OpsTestBase {
TF_EXPECT_OK(InitOp());
}
- template <typename IN, typename OUT>
+ template <typename INPUT, typename OUTPUT>
void CheckCast() {
- DataType in_type = DataTypeToEnum<IN>::v();
- DataType out_type = DataTypeToEnum<OUT>::v();
+ DataType in_type = DataTypeToEnum<INPUT>::v();
+ DataType out_type = DataTypeToEnum<OUTPUT>::v();
MakeOp(in_type, out_type);
- AddInputFromArray<IN>(TensorShape({1, 2, 2, 1}),
- {IN(1), IN(2), IN(3), IN(4)});
+ AddInputFromArray<INPUT>(TensorShape({1, 2, 2, 1}),
+ {INPUT(1), INPUT(2), INPUT(3), INPUT(4)});
TF_ASSERT_OK(RunOpKernel());
Tensor expected(allocator(), out_type, TensorShape({1, 2, 2, 1}));
- test::FillValues<OUT>(&expected, {OUT(1), OUT(2), OUT(3), OUT(4)});
- test::ExpectTensorEqual<OUT>(expected, *GetOutput(0));
+ test::FillValues<OUTPUT>(&expected,
+ {OUTPUT(1), OUTPUT(2), OUTPUT(3), OUTPUT(4)});
+ test::ExpectTensorEqual<OUTPUT>(expected, *GetOutput(0));
}
};
diff --git a/tensorflow/core/kernels/colorspace_op_test.cc b/tensorflow/core/kernels/colorspace_op_test.cc
index 4719a59b63..943d25a975 100644
--- a/tensorflow/core/kernels/colorspace_op_test.cc
+++ b/tensorflow/core/kernels/colorspace_op_test.cc
@@ -71,7 +71,7 @@ class RGBToHSVOpTest : public OpsTestBase {
void CheckRedMax(DataType data_type) {
// Test case where red channel dominates
- AddInputFromArray<T>(TensorShape({3}), {.8, .4, .2});
+ AddInputFromArray<T>(TensorShape({3}), {.8f, .4f, .2f});
TF_ASSERT_OK(RunOpKernel());
T expected_h = 1. / 6. * .2 / .6;
@@ -85,7 +85,7 @@ class RGBToHSVOpTest : public OpsTestBase {
void CheckGreenMax(DataType data_type) {
// Test case where green channel dominates
- AddInputFromArray<T>(TensorShape({3}), {.2, .8, .4});
+ AddInputFromArray<T>(TensorShape({3}), {.2f, .8f, .4f});
TF_ASSERT_OK(RunOpKernel());
T expected_h = 1. / 6. * (2.0 + (.2 / .6));
@@ -99,7 +99,7 @@ class RGBToHSVOpTest : public OpsTestBase {
void CheckBlueMax(DataType data_type) {
// Test case where blue channel dominates
- AddInputFromArray<T>(TensorShape({3}), {.4, .2, .8});
+ AddInputFromArray<T>(TensorShape({3}), {.4f, .2f, .8f});
TF_ASSERT_OK(RunOpKernel());
T expected_h = 1. / 6. * (4.0 + (.2 / .6));
@@ -112,7 +112,7 @@ class RGBToHSVOpTest : public OpsTestBase {
}
void CheckNegativeDifference(DataType data_type) {
- AddInputFromArray<T>(TensorShape({3}), {0, .1, .2});
+ AddInputFromArray<T>(TensorShape({3}), {0, .1f, .2f});
TF_ASSERT_OK(RunOpKernel());
T expected_h = 1. / 6. * (4.0 + (-.1 / .2));
@@ -220,7 +220,7 @@ class HSVToRGBOpTest : public OpsTestBase {
TF_ASSERT_OK(RunOpKernel());
Tensor expected(allocator(), data_type, TensorShape({3}));
- test::FillValues<T>(&expected, {0, .1, .2});
+ test::FillValues<T>(&expected, {0, .1f, .2f});
test::ExpectTensorNear<T>(expected, *GetOutput(0), 1e-6);
}
};
diff --git a/tensorflow/core/kernels/control_flow_ops.cc b/tensorflow/core/kernels/control_flow_ops.cc
index e92b11efc6..b01263f288 100644
--- a/tensorflow/core/kernels/control_flow_ops.cc
+++ b/tensorflow/core/kernels/control_flow_ops.cc
@@ -113,9 +113,12 @@ REGISTER_GPU_HOST_REF_KERNEL(string);
#undef REGISTER_GPU_HOST_REF_KERNEL
#if TENSORFLOW_USE_SYCL
-#define REGISTER_SYCL_KERNEL(type) \
- REGISTER_KERNEL_BUILDER( \
- Name("Switch").Device(DEVICE_SYCL).TypeConstraint<type>("T"), SwitchOp)
+#define REGISTER_SYCL_KERNEL(type) \
+ REGISTER_KERNEL_BUILDER(Name("Switch") \
+ .Device(DEVICE_SYCL) \
+ .TypeConstraint<type>("T") \
+ .HostMemory("pred"), \
+ SwitchOp)
REGISTER_SYCL_KERNEL(bool);
TF_CALL_NUMBER_TYPES_NO_INT32(REGISTER_SYCL_KERNEL);
#undef REGISTER_SYCL_KERNEL
@@ -219,9 +222,12 @@ REGISTER_GPU_REF_KERNEL(bool);
#undef REGISTER_GPU_REF_KERNEL
#if TENSORFLOW_USE_SYCL
-#define REGISTER_SYCL_KERNEL(type) \
- REGISTER_KERNEL_BUILDER( \
- Name("Merge").Device(DEVICE_SYCL).TypeConstraint<type>("T"), MergeOp)
+#define REGISTER_SYCL_KERNEL(type) \
+ REGISTER_KERNEL_BUILDER(Name("Merge") \
+ .Device(DEVICE_SYCL) \
+ .TypeConstraint<type>("T") \
+ .HostMemory("value_index"), \
+ MergeOp)
REGISTER_SYCL_KERNEL(bool);
TF_CALL_NUMBER_TYPES_NO_INT32(REGISTER_SYCL_KERNEL);
#undef REGISTER_SYCL_KERNEL
@@ -418,8 +424,12 @@ REGISTER_GPU_HOST_KERNEL(string);
#if TENSORFLOW_USE_SYCL
#define REGISTER_SYCL_KERNEL(type) \
- REGISTER_KERNEL_BUILDER( \
- Name("NextIteration").Device(DEVICE_SYCL).TypeConstraint<type>("T"), NextIterationOp)
+ REGISTER_KERNEL_BUILDER(Name("NextIteration") \
+ .Device(DEVICE_SYCL) \
+ .HostMemory("data") \
+ .HostMemory("output") \
+ .TypeConstraint<type>("T"), \
+ NextIterationOp)
REGISTER_SYCL_KERNEL(bool);
TF_CALL_NUMBER_TYPES_NO_INT32(REGISTER_SYCL_KERNEL);
#undef REGISTER_SYCL_KERNEL
diff --git a/tensorflow/core/kernels/control_flow_ops_test.cc b/tensorflow/core/kernels/control_flow_ops_test.cc
index 97f56c392a..301609e04d 100644
--- a/tensorflow/core/kernels/control_flow_ops_test.cc
+++ b/tensorflow/core/kernels/control_flow_ops_test.cc
@@ -85,13 +85,27 @@ class AbortOpTest : public OpsTestBase {
protected:
};
+#ifdef PLATFORM_WINDOWS
+#define SIGABRT 3
+
+class KilledBySignal {
+ public:
+ explicit KilledBySignal(int signum) : signum_(signum) {}
+ bool operator()(int exit_status) const { return exit_status == signum_; }
+ private:
+ const int signum_;
+};
+#else
+#define KilledBySignal ::testing::KilledBySignal
+#endif
+
// Pass an error message to the op.
TEST_F(AbortOpTest, pass_error_msg) {
TF_ASSERT_OK(NodeDefBuilder("abort_op", "Abort")
.Attr("error_msg", "abort_op_test")
.Finalize(node_def()));
TF_ASSERT_OK(InitOp());
- EXPECT_EXIT(RunOpKernel(), ::testing::KilledBySignal(SIGABRT),
+ EXPECT_EXIT(RunOpKernel(), KilledBySignal(SIGABRT),
"Abort_op intentional failure; abort_op_test");
}
@@ -99,7 +113,7 @@ TEST_F(AbortOpTest, pass_error_msg) {
TEST_F(AbortOpTest, default_msg) {
TF_ASSERT_OK(NodeDefBuilder("abort_op", "Abort").Finalize(node_def()));
TF_ASSERT_OK(InitOp());
- EXPECT_EXIT(RunOpKernel(), ::testing::KilledBySignal(SIGABRT),
+ EXPECT_EXIT(RunOpKernel(), KilledBySignal(SIGABRT),
"Abort_op intentional failure; ");
}
diff --git a/tensorflow/core/kernels/cwise_op_abs.cc b/tensorflow/core/kernels/cwise_op_abs.cc
index fbc23b3b6f..8cf1eac41e 100644
--- a/tensorflow/core/kernels/cwise_op_abs.cc
+++ b/tensorflow/core/kernels/cwise_op_abs.cc
@@ -21,6 +21,18 @@ REGISTER5(UnaryOp, CPU, "Abs", functor::abs, float, Eigen::half, double, int32,
#if !defined(IS_MOBILE_PLATFORM)
REGISTER2(UnaryOp, CPU, "ComplexAbs", functor::abs, complex64, complex128);
#endif
+
+#if TENSORFLOW_USE_SYCL
+#define REGISTER_SYCL_KERNEL(TYPE) \
+ REGISTER_KERNEL_BUILDER( \
+ Name("Abs") \
+ .Device(DEVICE_SYCL) \
+ .TypeConstraint<TYPE>("T"), \
+ UnaryOp<SYCLDevice, functor::abs<TYPE>>);
+REGISTER_SYCL_KERNEL(float);
+#undef REGISTER_SYCL_KERNEL
+#endif // TENSORFLOW_USE_SYCL
+
#if GOOGLE_CUDA
REGISTER4(UnaryOp, GPU, "Abs", functor::abs, float, Eigen::half, double, int64);
REGISTER2(UnaryOp, GPU, "ComplexAbs", functor::abs, complex64, complex128);
diff --git a/tensorflow/core/kernels/cwise_op_acos.cc b/tensorflow/core/kernels/cwise_op_acos.cc
index c44c8bc6f6..1d2d815027 100644
--- a/tensorflow/core/kernels/cwise_op_acos.cc
+++ b/tensorflow/core/kernels/cwise_op_acos.cc
@@ -17,6 +17,18 @@ limitations under the License.
namespace tensorflow {
REGISTER2(UnaryOp, CPU, "Acos", functor::acos, float, double);
+
+#if TENSORFLOW_USE_SYCL
+#define REGISTER_SYCL_KERNEL(TYPE) \
+ REGISTER_KERNEL_BUILDER( \
+ Name("Acos") \
+ .Device(DEVICE_SYCL) \
+ .TypeConstraint<TYPE>("T"), \
+ UnaryOp<SYCLDevice, functor::acos<TYPE>>);
+REGISTER_SYCL_KERNEL(float);
+#undef REGISTER_SYCL_KERNEL
+#endif // TENSORFLOW_USE_SYCL
+
#if GOOGLE_CUDA
REGISTER2(UnaryOp, GPU, "Acos", functor::acos, float, double);
#endif
diff --git a/tensorflow/core/kernels/cwise_op_add_1.cc b/tensorflow/core/kernels/cwise_op_add_1.cc
index 44c552d18e..a6bff78694 100644
--- a/tensorflow/core/kernels/cwise_op_add_1.cc
+++ b/tensorflow/core/kernels/cwise_op_add_1.cc
@@ -26,7 +26,7 @@ REGISTER5(BinaryOp, CPU, "Add", functor::add, float, Eigen::half, double, int32,
.Device(DEVICE_SYCL) \
.TypeConstraint<TYPE>("T"), \
BinaryOp<SYCLDevice, functor::add<TYPE>>);
-TF_CALL_NUMBER_TYPES(REGISTER_SYCL_KERNEL);
+ REGISTER_SYCL_KERNEL(float);
#undef REGISTER_SYCL_KERNEL
#endif // TENSORFLOW_USE_SYCL
diff --git a/tensorflow/core/kernels/cwise_op_asin.cc b/tensorflow/core/kernels/cwise_op_asin.cc
index bba20aa6af..92a22e90c4 100644
--- a/tensorflow/core/kernels/cwise_op_asin.cc
+++ b/tensorflow/core/kernels/cwise_op_asin.cc
@@ -17,6 +17,18 @@ limitations under the License.
namespace tensorflow {
REGISTER2(UnaryOp, CPU, "Asin", functor::asin, float, double);
+
+#if TENSORFLOW_USE_SYCL
+#define REGISTER_SYCL_KERNEL(TYPE) \
+ REGISTER_KERNEL_BUILDER( \
+ Name("Asin") \
+ .Device(DEVICE_SYCL) \
+ .TypeConstraint<TYPE>("T"), \
+ UnaryOp<SYCLDevice, functor::asin<TYPE>>);
+REGISTER_SYCL_KERNEL(float);
+#undef REGISTER_SYCL_KERNEL
+#endif // TENSORFLOW_USE_SYCL
+
#if GOOGLE_CUDA
REGISTER2(UnaryOp, GPU, "Asin", functor::asin, float, double);
#endif
diff --git a/tensorflow/core/kernels/cwise_op_atan.cc b/tensorflow/core/kernels/cwise_op_atan.cc
index 055b8289d4..825e85283f 100644
--- a/tensorflow/core/kernels/cwise_op_atan.cc
+++ b/tensorflow/core/kernels/cwise_op_atan.cc
@@ -17,6 +17,18 @@ limitations under the License.
namespace tensorflow {
REGISTER2(UnaryOp, CPU, "Atan", functor::atan, float, double);
+
+#if TENSORFLOW_USE_SYCL
+#define REGISTER_SYCL_KERNEL(TYPE) \
+ REGISTER_KERNEL_BUILDER( \
+ Name("Atan") \
+ .Device(DEVICE_SYCL) \
+ .TypeConstraint<TYPE>("T"), \
+ UnaryOp<SYCLDevice, functor::atan<TYPE>>);
+REGISTER_SYCL_KERNEL(float);
+#undef REGISTER_SYCL_KERNEL
+#endif // TENSORFLOW_USE_SYCL
+
#if GOOGLE_CUDA
REGISTER2(UnaryOp, GPU, "Atan", functor::atan, float, double);
#endif
diff --git a/tensorflow/core/kernels/cwise_op_ceil.cc b/tensorflow/core/kernels/cwise_op_ceil.cc
index 08ac1b4194..c5a4aaf831 100644
--- a/tensorflow/core/kernels/cwise_op_ceil.cc
+++ b/tensorflow/core/kernels/cwise_op_ceil.cc
@@ -17,6 +17,18 @@ limitations under the License.
namespace tensorflow {
REGISTER3(UnaryOp, CPU, "Ceil", functor::ceil, float, Eigen::half, double);
+
+#if TENSORFLOW_USE_SYCL
+#define REGISTER_SYCL_KERNEL(TYPE) \
+ REGISTER_KERNEL_BUILDER( \
+ Name("Ceil") \
+ .Device(DEVICE_SYCL) \
+ .TypeConstraint<TYPE>("T"), \
+ UnaryOp<SYCLDevice, functor::ceil<TYPE>>);
+REGISTER_SYCL_KERNEL(float);
+#undef REGISTER_SYCL_KERNEL
+#endif // TENSORFLOW_USE_SYCL
+
#if GOOGLE_CUDA
REGISTER3(UnaryOp, GPU, "Ceil", functor::ceil, float, Eigen::half, double);
#endif
diff --git a/tensorflow/core/kernels/cwise_op_cos.cc b/tensorflow/core/kernels/cwise_op_cos.cc
index 2680143d65..a758da5842 100644
--- a/tensorflow/core/kernels/cwise_op_cos.cc
+++ b/tensorflow/core/kernels/cwise_op_cos.cc
@@ -18,6 +18,18 @@ limitations under the License.
namespace tensorflow {
REGISTER5(UnaryOp, CPU, "Cos", functor::cos, float, Eigen::half, double,
complex64, complex128);
+
+#if TENSORFLOW_USE_SYCL
+#define REGISTER_SYCL_KERNEL(TYPE) \
+ REGISTER_KERNEL_BUILDER( \
+ Name("Cos") \
+ .Device(DEVICE_SYCL) \
+ .TypeConstraint<TYPE>("T"), \
+ UnaryOp<SYCLDevice, functor::cos<TYPE>>);
+REGISTER_SYCL_KERNEL(float);
+#undef REGISTER_SYCL_KERNEL
+#endif // TENSORFLOW_USE_SYCL
+
#if GOOGLE_CUDA
REGISTER3(UnaryOp, GPU, "Cos", functor::cos, float, Eigen::half, double);
#endif
diff --git a/tensorflow/core/kernels/cwise_op_div.cc b/tensorflow/core/kernels/cwise_op_div.cc
index c2b05a69b2..ef8c477e48 100644
--- a/tensorflow/core/kernels/cwise_op_div.cc
+++ b/tensorflow/core/kernels/cwise_op_div.cc
@@ -30,6 +30,11 @@ REGISTER5(BinaryOp, CPU, "RealDiv", functor::div, float, Eigen::half, double,
Name("Div") \
.Device(DEVICE_SYCL) \
.TypeConstraint<TYPE>("T"), \
+ BinaryOp<SYCLDevice, functor::div<TYPE>>); \
+ REGISTER_KERNEL_BUILDER( \
+ Name("RealDiv") \
+ .Device(DEVICE_SYCL) \
+ .TypeConstraint<TYPE>("T"), \
BinaryOp<SYCLDevice, functor::div<TYPE>>);
REGISTER_SYCL_KERNEL(float)
#undef REGISTER_SYCL_KERNEL
diff --git a/tensorflow/core/kernels/cwise_op_exp.cc b/tensorflow/core/kernels/cwise_op_exp.cc
index 7ec3526282..0ee47f7dee 100644
--- a/tensorflow/core/kernels/cwise_op_exp.cc
+++ b/tensorflow/core/kernels/cwise_op_exp.cc
@@ -18,6 +18,18 @@ limitations under the License.
namespace tensorflow {
REGISTER5(UnaryOp, CPU, "Exp", functor::exp, float, Eigen::half, double,
complex64, complex128);
+
+#if TENSORFLOW_USE_SYCL
+#define REGISTER_SYCL_KERNEL(TYPE) \
+ REGISTER_KERNEL_BUILDER( \
+ Name("Exp") \
+ .Device(DEVICE_SYCL) \
+ .TypeConstraint<TYPE>("T"), \
+ UnaryOp<SYCLDevice, functor::exp<TYPE>>);
+REGISTER_SYCL_KERNEL(float);
+#undef REGISTER_SYCL_KERNEL
+#endif // TENSORFLOW_USE_SYCL
+
#if GOOGLE_CUDA
REGISTER3(UnaryOp, GPU, "Exp", functor::exp, float, Eigen::half, double);
#endif
diff --git a/tensorflow/core/kernels/cwise_op_floor.cc b/tensorflow/core/kernels/cwise_op_floor.cc
index 732087d4cb..129d754b82 100644
--- a/tensorflow/core/kernels/cwise_op_floor.cc
+++ b/tensorflow/core/kernels/cwise_op_floor.cc
@@ -17,6 +17,18 @@ limitations under the License.
namespace tensorflow {
REGISTER3(UnaryOp, CPU, "Floor", functor::floor, float, Eigen::half, double);
+
+#if TENSORFLOW_USE_SYCL
+#define REGISTER_SYCL_KERNEL(TYPE) \
+ REGISTER_KERNEL_BUILDER( \
+ Name("Floor") \
+ .Device(DEVICE_SYCL) \
+ .TypeConstraint<TYPE>("T"), \
+ UnaryOp<SYCLDevice, functor::floor<TYPE>>);
+REGISTER_SYCL_KERNEL(float);
+#undef REGISTER_SYCL_KERNEL
+#endif // TENSORFLOW_USE_SYCL
+
#if GOOGLE_CUDA
REGISTER3(UnaryOp, GPU, "Floor", functor::floor, float, Eigen::half, double);
#endif
diff --git a/tensorflow/core/kernels/cwise_op_floor_div.cc b/tensorflow/core/kernels/cwise_op_floor_div.cc
index a5767476c3..69dbb70b83 100644
--- a/tensorflow/core/kernels/cwise_op_floor_div.cc
+++ b/tensorflow/core/kernels/cwise_op_floor_div.cc
@@ -18,6 +18,9 @@ limitations under the License.
namespace tensorflow {
REGISTER5(BinaryOp, CPU, "FloorDiv", functor::safe_floor_div, uint8, uint16,
int16, int32, int64);
+REGISTER3(BinaryOp, CPU, "FloorDiv", functor::floor_div_real, float,
+ Eigen::half, double);
+
#if TENSORFLOW_USE_SYCL
#define REGISTER_SYCL_KERNEL(TYPE) \
REGISTER_KERNEL_BUILDER( \
@@ -25,11 +28,10 @@ REGISTER5(BinaryOp, CPU, "FloorDiv", functor::safe_floor_div, uint8, uint16,
.Device(DEVICE_SYCL) \
.TypeConstraint<TYPE>("T"), \
BinaryOp<SYCLDevice, functor::floor_div<TYPE>>);
-TF_CALL_INTEGRAL_TYPES(REGISTER_SYCL_KERNEL);
+REGISTER_SYCL_KERNEL(float)
#undef REGISTER_SYCL_KERNEL
#endif // TENSORFLOW_USE_SYCL
-REGISTER3(BinaryOp, CPU, "FloorDiv", functor::floor_div_real, float,
- Eigen::half, double);
+
#if GOOGLE_CUDA
REGISTER4(BinaryOp, GPU, "FloorDiv", functor::floor_div, uint8, uint16, int16,
int64);
diff --git a/tensorflow/core/kernels/cwise_op_isfinite.cc b/tensorflow/core/kernels/cwise_op_isfinite.cc
index e38b271318..59976141c7 100644
--- a/tensorflow/core/kernels/cwise_op_isfinite.cc
+++ b/tensorflow/core/kernels/cwise_op_isfinite.cc
@@ -18,6 +18,7 @@ limitations under the License.
namespace tensorflow {
REGISTER3(UnaryOp, CPU, "IsFinite", functor::isfinite, float, Eigen::half,
double);
+
#if TENSORFLOW_USE_SYCL
#define REGISTER_SYCL_KERNEL(TYPE) \
REGISTER_KERNEL_BUILDER( \
@@ -25,9 +26,10 @@ REGISTER3(UnaryOp, CPU, "IsFinite", functor::isfinite, float, Eigen::half,
.Device(DEVICE_SYCL) \
.TypeConstraint<TYPE>("T"), \
UnaryOp<SYCLDevice, functor::isfinite<TYPE>>);
-TF_CALL_REAL_NUMBER_TYPES(REGISTER_SYCL_KERNEL);
+REGISTER_SYCL_KERNEL(float);
#undef REGISTER_SYCL_KERNEL
#endif // TENSORFLOW_USE_SYCL
+
#if GOOGLE_CUDA
REGISTER3(UnaryOp, GPU, "IsFinite", functor::isfinite, float, Eigen::half,
double);
diff --git a/tensorflow/core/kernels/cwise_op_isinf.cc b/tensorflow/core/kernels/cwise_op_isinf.cc
index bf056dbe0e..675cb95b95 100644
--- a/tensorflow/core/kernels/cwise_op_isinf.cc
+++ b/tensorflow/core/kernels/cwise_op_isinf.cc
@@ -17,6 +17,7 @@ limitations under the License.
namespace tensorflow {
REGISTER3(UnaryOp, CPU, "IsInf", functor::isinf, float, Eigen::half, double);
+
#if TENSORFLOW_USE_SYCL
#define REGISTER_SYCL_KERNEL(TYPE) \
REGISTER_KERNEL_BUILDER( \
@@ -24,9 +25,10 @@ REGISTER3(UnaryOp, CPU, "IsInf", functor::isinf, float, Eigen::half, double);
.Device(DEVICE_SYCL) \
.TypeConstraint<TYPE>("T"), \
UnaryOp<SYCLDevice, functor::isinf<TYPE>>);
-TF_CALL_REAL_NUMBER_TYPES(REGISTER_SYCL_KERNEL);
+REGISTER_SYCL_KERNEL(float);
#undef REGISTER_SYCL_KERNEL
#endif // TENSORFLOW_USE_SYCL
+
#if GOOGLE_CUDA
REGISTER3(UnaryOp, GPU, "IsInf", functor::isinf, float, Eigen::half, double);
#endif
diff --git a/tensorflow/core/kernels/cwise_op_isnan.cc b/tensorflow/core/kernels/cwise_op_isnan.cc
index d2bac23882..c394087ed8 100644
--- a/tensorflow/core/kernels/cwise_op_isnan.cc
+++ b/tensorflow/core/kernels/cwise_op_isnan.cc
@@ -17,6 +17,7 @@ limitations under the License.
namespace tensorflow {
REGISTER3(UnaryOp, CPU, "IsNan", functor::isnan, float, Eigen::half, double);
+
#if TENSORFLOW_USE_SYCL
#define REGISTER_SYCL_KERNEL(TYPE) \
REGISTER_KERNEL_BUILDER( \
@@ -24,9 +25,10 @@ REGISTER3(UnaryOp, CPU, "IsNan", functor::isnan, float, Eigen::half, double);
.Device(DEVICE_SYCL) \
.TypeConstraint<TYPE>("T"), \
UnaryOp<SYCLDevice, functor::isnan<TYPE>>);
-TF_CALL_REAL_NUMBER_TYPES(REGISTER_SYCL_KERNEL);
+REGISTER_SYCL_KERNEL(float);
#undef REGISTER_SYCL_KERNEL
#endif // TENSORFLOW_USE_SYCL
+
#if GOOGLE_CUDA
REGISTER3(UnaryOp, GPU, "IsNan", functor::isnan, float, Eigen::half, double);
#endif
diff --git a/tensorflow/core/kernels/cwise_op_log.cc b/tensorflow/core/kernels/cwise_op_log.cc
index be184f03de..71c4588b3d 100644
--- a/tensorflow/core/kernels/cwise_op_log.cc
+++ b/tensorflow/core/kernels/cwise_op_log.cc
@@ -18,6 +18,18 @@ limitations under the License.
namespace tensorflow {
REGISTER5(UnaryOp, CPU, "Log", functor::log, float, Eigen::half, double,
complex64, complex128);
+
+#if TENSORFLOW_USE_SYCL
+#define REGISTER_SYCL_KERNEL(TYPE) \
+ REGISTER_KERNEL_BUILDER( \
+ Name("Log") \
+ .Device(DEVICE_SYCL) \
+ .TypeConstraint<TYPE>("T"), \
+ UnaryOp<SYCLDevice, functor::log<TYPE>>);
+REGISTER_SYCL_KERNEL(float);
+#undef REGISTER_SYCL_KERNEL
+#endif // TENSORFLOW_USE_SYCL
+
#if GOOGLE_CUDA
REGISTER3(UnaryOp, GPU, "Log", functor::log, float, Eigen::half, double);
#endif
diff --git a/tensorflow/core/kernels/cwise_op_log1p.cc b/tensorflow/core/kernels/cwise_op_log1p.cc
index 91a14989e6..03ea3a0a89 100644
--- a/tensorflow/core/kernels/cwise_op_log1p.cc
+++ b/tensorflow/core/kernels/cwise_op_log1p.cc
@@ -18,6 +18,18 @@ limitations under the License.
namespace tensorflow {
REGISTER5(UnaryOp, CPU, "Log1p", functor::log1p, float, Eigen::half, double,
complex64, complex128);
+
+#if TENSORFLOW_USE_SYCL
+#define REGISTER_SYCL_KERNEL(TYPE) \
+ REGISTER_KERNEL_BUILDER( \
+ Name("Log1p") \
+ .Device(DEVICE_SYCL) \
+ .TypeConstraint<TYPE>("T"), \
+ UnaryOp<SYCLDevice, functor::log1p<TYPE>>);
+REGISTER_SYCL_KERNEL(float);
+#undef REGISTER_SYCL_KERNEL
+#endif // TENSORFLOW_USE_SYCL
+
#if GOOGLE_CUDA
REGISTER3(UnaryOp, GPU, "Log1p", functor::log1p, float, Eigen::half, double);
#endif
diff --git a/tensorflow/core/kernels/cwise_op_neg.cc b/tensorflow/core/kernels/cwise_op_neg.cc
index 67b088e110..4221fc0710 100644
--- a/tensorflow/core/kernels/cwise_op_neg.cc
+++ b/tensorflow/core/kernels/cwise_op_neg.cc
@@ -18,6 +18,18 @@ limitations under the License.
namespace tensorflow {
REGISTER7(UnaryOp, CPU, "Neg", functor::neg, float, Eigen::half, double, int32,
complex64, int64, complex128);
+
+#if TENSORFLOW_USE_SYCL
+#define REGISTER_SYCL_KERNEL(TYPE) \
+ REGISTER_KERNEL_BUILDER( \
+ Name("Neg") \
+ .Device(DEVICE_SYCL) \
+ .TypeConstraint<TYPE>("T"), \
+ UnaryOp<SYCLDevice, functor::neg<TYPE>>);
+REGISTER_SYCL_KERNEL(float);
+#undef REGISTER_SYCL_KERNEL
+#endif // TENSORFLOW_USE_SYCL
+
#if GOOGLE_CUDA
REGISTER4(UnaryOp, GPU, "Neg", functor::neg, float, Eigen::half, double, int64);
diff --git a/tensorflow/core/kernels/cwise_op_pow.cc b/tensorflow/core/kernels/cwise_op_pow.cc
index dd28b36519..8eeba6ab14 100644
--- a/tensorflow/core/kernels/cwise_op_pow.cc
+++ b/tensorflow/core/kernels/cwise_op_pow.cc
@@ -18,6 +18,18 @@ limitations under the License.
namespace tensorflow {
REGISTER7(BinaryOp, CPU, "Pow", functor::pow, float, Eigen::half, double, int32,
int64, complex64, complex128);
+
+#if TENSORFLOW_USE_SYCL
+#define REGISTER_SYCL_KERNEL(TYPE) \
+ REGISTER_KERNEL_BUILDER( \
+ Name("Pow") \
+ .Device(DEVICE_SYCL) \
+ .TypeConstraint<TYPE>("T"), \
+ BinaryOp<SYCLDevice, functor::pow<TYPE>>);
+REGISTER_SYCL_KERNEL(float);
+#undef REGISTER_SYCL_KERNEL
+#endif // TENSORFLOW_USE_SYCL
+
#if GOOGLE_CUDA
REGISTER4(BinaryOp, GPU, "Pow", functor::pow, float, Eigen::half, double,
int64);
diff --git a/tensorflow/core/kernels/cwise_op_rsqrt.cc b/tensorflow/core/kernels/cwise_op_rsqrt.cc
index 3207166e94..7dc96d47a6 100644
--- a/tensorflow/core/kernels/cwise_op_rsqrt.cc
+++ b/tensorflow/core/kernels/cwise_op_rsqrt.cc
@@ -18,6 +18,18 @@ limitations under the License.
namespace tensorflow {
REGISTER5(UnaryOp, CPU, "Rsqrt", functor::rsqrt, float, Eigen::half, double,
complex64, complex128);
+
+#if TENSORFLOW_USE_SYCL
+#define REGISTER_SYCL_KERNEL(TYPE) \
+ REGISTER_KERNEL_BUILDER( \
+ Name("Rsqrt") \
+ .Device(DEVICE_SYCL) \
+ .TypeConstraint<TYPE>("T"), \
+ UnaryOp<SYCLDevice, functor::rsqrt<TYPE>>);
+REGISTER_SYCL_KERNEL(float);
+#undef REGISTER_SYCL_KERNEL
+#endif // TENSORFLOW_USE_SYCL
+
#if GOOGLE_CUDA
REGISTER3(UnaryOp, GPU, "Rsqrt", functor::rsqrt, float, Eigen::half, double);
#endif
diff --git a/tensorflow/core/kernels/cwise_op_sin.cc b/tensorflow/core/kernels/cwise_op_sin.cc
index 1e3880beb1..8d0c0959f7 100644
--- a/tensorflow/core/kernels/cwise_op_sin.cc
+++ b/tensorflow/core/kernels/cwise_op_sin.cc
@@ -18,6 +18,18 @@ limitations under the License.
namespace tensorflow {
REGISTER5(UnaryOp, CPU, "Sin", functor::sin, float, Eigen::half, double,
complex64, complex128);
+
+#if TENSORFLOW_USE_SYCL
+#define REGISTER_SYCL_KERNEL(TYPE) \
+ REGISTER_KERNEL_BUILDER( \
+ Name("Sin") \
+ .Device(DEVICE_SYCL) \
+ .TypeConstraint<TYPE>("T"), \
+ UnaryOp<SYCLDevice, functor::sin<TYPE>>);
+REGISTER_SYCL_KERNEL(float);
+#undef REGISTER_SYCL_KERNEL
+#endif // TENSORFLOW_USE_SYCL
+
#if GOOGLE_CUDA
REGISTER3(UnaryOp, GPU, "Sin", functor::sin, float, Eigen::half, double);
#endif
diff --git a/tensorflow/core/kernels/cwise_op_sqrt.cc b/tensorflow/core/kernels/cwise_op_sqrt.cc
index aecffda4ba..710001517b 100644
--- a/tensorflow/core/kernels/cwise_op_sqrt.cc
+++ b/tensorflow/core/kernels/cwise_op_sqrt.cc
@@ -18,6 +18,18 @@ limitations under the License.
namespace tensorflow {
REGISTER5(UnaryOp, CPU, "Sqrt", functor::sqrt, float, Eigen::half, double,
complex64, complex128);
+
+#if TENSORFLOW_USE_SYCL
+#define REGISTER_SYCL_KERNEL(TYPE) \
+ REGISTER_KERNEL_BUILDER( \
+ Name("Sqrt") \
+ .Device(DEVICE_SYCL) \
+ .TypeConstraint<TYPE>("T"), \
+ UnaryOp<SYCLDevice, functor::sqrt<TYPE>>);
+REGISTER_SYCL_KERNEL(float);
+#undef REGISTER_SYCL_KERNEL
+#endif // TENSORFLOW_USE_SYCL
+
#if GOOGLE_CUDA
REGISTER3(UnaryOp, GPU, "Sqrt", functor::sqrt, float, Eigen::half, double);
#endif
diff --git a/tensorflow/core/kernels/cwise_op_square.cc b/tensorflow/core/kernels/cwise_op_square.cc
index 0ce4473d83..f867f127a7 100644
--- a/tensorflow/core/kernels/cwise_op_square.cc
+++ b/tensorflow/core/kernels/cwise_op_square.cc
@@ -18,6 +18,18 @@ limitations under the License.
namespace tensorflow {
REGISTER7(UnaryOp, CPU, "Square", functor::square, float, Eigen::half, double,
int32, int64, complex64, complex128);
+
+#if TENSORFLOW_USE_SYCL
+#define REGISTER_SYCL_KERNEL(TYPE) \
+ REGISTER_KERNEL_BUILDER( \
+ Name("Square") \
+ .Device(DEVICE_SYCL) \
+ .TypeConstraint<TYPE>("T"), \
+ UnaryOp<SYCLDevice, functor::square<TYPE>>);
+REGISTER_SYCL_KERNEL(float);
+#undef REGISTER_SYCL_KERNEL
+#endif // TENSORFLOW_USE_SYCL
+
#if GOOGLE_CUDA
REGISTER4(UnaryOp, GPU, "Square", functor::square, float, Eigen::half, double,
int64);
diff --git a/tensorflow/core/kernels/cwise_op_sub.cc b/tensorflow/core/kernels/cwise_op_sub.cc
index ed78ba37a8..e1326dbed1 100644
--- a/tensorflow/core/kernels/cwise_op_sub.cc
+++ b/tensorflow/core/kernels/cwise_op_sub.cc
@@ -31,7 +31,7 @@ REGISTER(BinaryOp, CPU, "Sub", functor::sub, int32);
.Device(DEVICE_SYCL) \
.TypeConstraint<TYPE>("T"), \
BinaryOp<SYCLDevice, functor::sub<TYPE>>);
-TF_CALL_NUMBER_TYPES(REGISTER_SYCL_KERNEL);
+ REGISTER_SYCL_KERNEL(float);
#undef REGISTER_SYCL_KERNEL
#endif // TENSORFLOW_USE_SYCL
#if GOOGLE_CUDA
diff --git a/tensorflow/core/kernels/cwise_op_tan.cc b/tensorflow/core/kernels/cwise_op_tan.cc
index fca1addfa4..ac49cad88f 100644
--- a/tensorflow/core/kernels/cwise_op_tan.cc
+++ b/tensorflow/core/kernels/cwise_op_tan.cc
@@ -17,6 +17,18 @@ limitations under the License.
namespace tensorflow {
REGISTER2(UnaryOp, CPU, "Tan", functor::tan, float, double);
+
+#if TENSORFLOW_USE_SYCL
+#define REGISTER_SYCL_KERNEL(TYPE) \
+ REGISTER_KERNEL_BUILDER( \
+ Name("Tan") \
+ .Device(DEVICE_SYCL) \
+ .TypeConstraint<TYPE>("T"), \
+ UnaryOp<SYCLDevice, functor::tan<TYPE>>);
+REGISTER_SYCL_KERNEL(float);
+#undef REGISTER_SYCL_KERNEL
+#endif // TENSORFLOW_USE_SYCL
+
#if GOOGLE_CUDA
REGISTER2(UnaryOp, GPU, "Tan", functor::tan, float, double);
#endif
diff --git a/tensorflow/core/kernels/cwise_op_tanh.cc b/tensorflow/core/kernels/cwise_op_tanh.cc
index a4c4aad053..ae2c473e20 100644
--- a/tensorflow/core/kernels/cwise_op_tanh.cc
+++ b/tensorflow/core/kernels/cwise_op_tanh.cc
@@ -19,6 +19,18 @@ limitations under the License.
namespace tensorflow {
REGISTER5(UnaryOp, CPU, "Tanh", functor::tanh, float, Eigen::half, double,
complex64, complex128);
+
+#if TENSORFLOW_USE_SYCL
+#define REGISTER_SYCL_KERNEL(TYPE) \
+ REGISTER_KERNEL_BUILDER( \
+ Name("Tanh") \
+ .Device(DEVICE_SYCL) \
+ .TypeConstraint<TYPE>("T"), \
+ UnaryOp<SYCLDevice, functor::tanh<TYPE>>);
+REGISTER_SYCL_KERNEL(float);
+#undef REGISTER_SYCL_KERNEL
+#endif // TENSORFLOW_USE_SYCL
+
#if GOOGLE_CUDA
REGISTER3(UnaryOp, GPU, "Tanh", functor::tanh, float, Eigen::half, double);
#endif
diff --git a/tensorflow/core/kernels/cwise_ops_sycl_common.h b/tensorflow/core/kernels/cwise_ops_sycl_common.h
index 4c22cc4855..3fcf0759d4 100644
--- a/tensorflow/core/kernels/cwise_ops_sycl_common.h
+++ b/tensorflow/core/kernels/cwise_ops_sycl_common.h
@@ -21,12 +21,10 @@ limitations under the License.
#define TENSORFLOW_CORE_KERNELS_CWISE_OPS_SYCL_COMMON_H_
#define EIGEN_USE_SYCL
+#include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
#include "tensorflow/core/framework/register_types.h"
-
-#include "tensorflow/core/framework/tensor_types.h"
#include "tensorflow/core/kernels/cwise_ops.h"
-#include "tensorflow/core/platform/logging.h"
#include "tensorflow/core/platform/types.h"
namespace tensorflow {
@@ -62,14 +60,14 @@ struct BinaryFunctor<SYCLDevice, Functor, NDIMS, has_errors> {
void operator()(const SYCLDevice& d, typename Functor::tout_type out,
typename Functor::tin_type in0,
typename Functor::tin_type in1, bool* error) {
- To32Bit(out).device(d) = To32Bit(in0).binaryExpr(in1, typename Functor::func());
+ To32Bit(out).device(d) = To32Bit(in0).binaryExpr(To32Bit(in1), typename Functor::func());
}
void Left(const SYCLDevice& d, typename Functor::tout_type out,
typename Functor::tscalar_type scalar,
typename Functor::tin_type in, bool* error) {
typedef typename Functor::func Binary;
- constexpr int NumDims = Functor::tin_type::NumDimensions;
+ constexpr int NumDims = Functor::tin_type::NumDimensions;
typedef typename Functor::tin_type::Scalar T;
typedef typename Functor::tin_type::Index Index;
Eigen::array<Index, NumDims> scalar_dim = GenerateArrayOfOnes<Index, NumDims>();
diff --git a/tensorflow/core/kernels/debug_ops.cc b/tensorflow/core/kernels/debug_ops.cc
index 1a4d70c36b..78d386a5af 100644
--- a/tensorflow/core/kernels/debug_ops.cc
+++ b/tensorflow/core/kernels/debug_ops.cc
@@ -28,6 +28,16 @@ REGISTER_KERNEL_BUILDER(Name("Copy").Device(DEVICE_CPU), CopyOp);
REGISTER_KERNEL_BUILDER(Name("CopyHost").Device(DEVICE_CPU), CopyOp);
+#ifdef TENSORFLOW_USE_SYCL
+REGISTER_KERNEL_BUILDER(Name("Copy").Device(DEVICE_SYCL), CopyOp);
+
+REGISTER_KERNEL_BUILDER(Name("CopyHost")
+ .Device(DEVICE_SYCL)
+ .HostMemory("input")
+ .HostMemory("output"),
+ CopyOp);
+#endif // TENSORFLOW_USE_SYCL
+
#if GOOGLE_CUDA
REGISTER_KERNEL_BUILDER(Name("Copy").Device(DEVICE_GPU), CopyOp);
@@ -50,6 +60,14 @@ REGISTER_KERNEL_BUILDER(Name("DebugIdentity")
DebugIdentityOp);
#endif
+#ifdef TENSORFLOW_USE_SYCL
+REGISTER_KERNEL_BUILDER(Name("DebugIdentity")
+ .Device(DEVICE_SYCL)
+ .HostMemory("input")
+ .HostMemory("output"),
+ DebugIdentityOp);
+#endif // TENSORFLOW_USE_SYCL
+
// Register debug NaN-counter (non-ref and ref) ops.
#define REGISTER_DEBUG_NAN_COUNT(type) \
REGISTER_KERNEL_BUILDER( \
@@ -70,4 +88,15 @@ REGISTER_GPU_DEBUG_NAN_COUNT(float);
REGISTER_GPU_DEBUG_NAN_COUNT(double);
#endif
+#ifdef TENSORFLOW_USE_SYCL
+#define REGISTER_GPU_DEBUG_NAN_COUNT(type) \
+ REGISTER_KERNEL_BUILDER(Name("DebugNanCount") \
+ .Device(DEVICE_SYCL) \
+ .HostMemory("input") \
+ .HostMemory("output") \
+ .TypeConstraint<type>("T"), \
+ DebugNanCountOp<type>);
+REGISTER_GPU_DEBUG_NAN_COUNT(float);
+#endif // TENSORFLOW_USE_SYCL
+
} // namespace tensorflow
diff --git a/tensorflow/core/kernels/dense_update_ops.cc b/tensorflow/core/kernels/dense_update_ops.cc
index baa8f83091..5216a4b5d0 100644
--- a/tensorflow/core/kernels/dense_update_ops.cc
+++ b/tensorflow/core/kernels/dense_update_ops.cc
@@ -97,13 +97,20 @@ TF_CALL_QUANTIZED_TYPES(REGISTER_KERNELS);
#if TENSORFLOW_USE_SYCL
typedef Eigen::SyclDevice SYCLDevice;
-#define REGISTER_SYCL_KERNEL(type) \
- REGISTER_KERNEL_BUILDER( \
- Name("Assign") \
- .Device(DEVICE_SYCL) \
- .TypeConstraint<type>("T"), \
- AssignOpT<SYCLDevice, type>);
-TF_CALL_NUMBER_TYPES(REGISTER_SYCL_KERNEL);
+#define REGISTER_SYCL_KERNEL(type) \
+ REGISTER_KERNEL_BUILDER( \
+ Name("Assign") \
+ .Device(DEVICE_SYCL) \
+ .TypeConstraint<type>("T"), \
+ AssignOpT<SYCLDevice, type>); \
+ REGISTER_KERNEL_BUILDER( \
+ Name("AssignAdd").Device(DEVICE_SYCL).TypeConstraint<type>("T"), \
+ DenseUpdateOp<SYCLDevice, type, DenseUpdateType::ADD>); \
+ REGISTER_KERNEL_BUILDER( \
+ Name("AssignSub").Device(DEVICE_SYCL).TypeConstraint<type>("T"), \
+ DenseUpdateOp<SYCLDevice, type, DenseUpdateType::SUB>);
+
+REGISTER_SYCL_KERNEL(float);
#undef REGISTER_SYCL_KERNEL
#endif
diff --git a/tensorflow/core/kernels/fact_op.cc b/tensorflow/core/kernels/fact_op.cc
index 52ad2d0c1f..f1ab4c4a4d 100644
--- a/tensorflow/core/kernels/fact_op.cc
+++ b/tensorflow/core/kernels/fact_op.cc
@@ -73,25 +73,46 @@ static void E(string* s) {
}
}
-template <const char* const FACTS[], uint64 N>
class FactOpKernel : public OpKernel {
public:
explicit FactOpKernel(OpKernelConstruction* context) : OpKernel(context) {}
- void Compute(OpKernelContext* context) override {
+ void Compute(OpKernelContext* context) override = 0;
+
+ protected:
+ void Compute(OpKernelContext* context, const char* const facts[],
+ uint64 count) {
Tensor* output_tensor = NULL;
OP_REQUIRES_OK(
context, context->allocate_output(0, TensorShape({}), &output_tensor));
auto output = output_tensor->template scalar<string>();
- string coded = FACTS[context->env()->NowMicros() % N];
+ string coded = facts[context->env()->NowMicros() % count];
E(&coded);
output() = coded;
}
};
+class FactOpKernel1 : public FactOpKernel {
+ public:
+ FactOpKernel1(OpKernelConstruction* context) : FactOpKernel(context) {}
+
+ void Compute(OpKernelContext* context) override {
+ FactOpKernel::Compute(context, kFacts1, kNum1);
+ }
+};
+
+class FactOpKernel2 : public FactOpKernel {
+ public:
+ FactOpKernel2(OpKernelConstruction* context) : FactOpKernel(context) {}
+
+ void Compute(OpKernelContext* context) override {
+ FactOpKernel::Compute(context, kFacts2, kNum2);
+ }
+};
+
REGISTER_KERNEL_BUILDER(Name("Fact").Device(DEVICE_GPU).HostMemory("fact"),
- FactOpKernel<kFacts1, kNum1>);
+ FactOpKernel1);
static string D(const char* s) {
string ret(s);
@@ -102,10 +123,10 @@ static string D(const char* s) {
REGISTER_KERNEL_BUILDER(Name("Fact")
.Device(DEVICE_CPU)
.Label(D("Yoxmos").c_str()),
- FactOpKernel<kFacts2, kNum2>);
+ FactOpKernel2);
REGISTER_KERNEL_BUILDER(Name("Fact")
.Device(DEVICE_CPU)
.Label(D("yoxmos").c_str()),
- FactOpKernel<kFacts2, kNum2>);
+ FactOpKernel2);
} // namespace tensorflow
diff --git a/tensorflow/core/kernels/fused_batch_norm_op_test.cc b/tensorflow/core/kernels/fused_batch_norm_op_test.cc
index c4b942c56f..a3f760b746 100644
--- a/tensorflow/core/kernels/fused_batch_norm_op_test.cc
+++ b/tensorflow/core/kernels/fused_batch_norm_op_test.cc
@@ -79,7 +79,7 @@ TEST_F(FusedBatchNormOpTest, Inference) {
AddInputFromArray<float>(TensorShape({2}), {4.0, 4.0});
AddInputFromArray<float>(TensorShape({2}), {2.0, 2.0});
AddInputFromArray<float>(TensorShape({2}), {10, 10});
- AddInputFromArray<float>(TensorShape({2}), {11.67, 11.67});
+ AddInputFromArray<float>(TensorShape({2}), {11.67f, 11.67f});
TF_ASSERT_OK(RunOpKernel());
@@ -106,8 +106,8 @@ TEST_F(FusedBatchNormGradOpTest, Simple) {
AddInputFromArray<float>(TensorShape({1, 1, 6, 2}),
{1, 1, 7, 7, 4, 4, -3, -3, -11, -11, 13, 13});
AddInputFromArray<float>(TensorShape({2}), {4, 4});
- AddInputFromArray<float>(TensorShape({2}), {1.833, 1.833});
- AddInputFromArray<float>(TensorShape({2}), {57.472, 57.472});
+ AddInputFromArray<float>(TensorShape({2}), {1.833f, 1.833f});
+ AddInputFromArray<float>(TensorShape({2}), {57.472f, 57.472f});
TF_ASSERT_OK(RunOpKernel());
diff --git a/tensorflow/core/kernels/non_max_suppression_op_test.cc b/tensorflow/core/kernels/non_max_suppression_op_test.cc
index 070dd49aef..72e368db77 100644
--- a/tensorflow/core/kernels/non_max_suppression_op_test.cc
+++ b/tensorflow/core/kernels/non_max_suppression_op_test.cc
@@ -45,9 +45,9 @@ class NonMaxSuppressionOpTest : public OpsTestBase {
TEST_F(NonMaxSuppressionOpTest, TestSelectFromThreeClusters) {
MakeOp(.5);
AddInputFromArray<float>(TensorShape({6, 4}),
- {0, 0, 1, 1, 0, 0.1, 1, 1.1, 0, -0.1, 1, 0.9,
- 0, 10, 1, 11, 0, 10.1, 1, 11.1, 0, 100, 1, 101});
- AddInputFromArray<float>(TensorShape({6}), {.9, .75, .6, .95, .5, .3});
+ {0, 0, 1, 1, 0, 0.1f, 1, 1.1f, 0, -0.1f, 1, 0.9f,
+ 0, 10, 1, 11, 0, 10.1f, 1, 11.1f, 0, 100, 1, 101});
+ AddInputFromArray<float>(TensorShape({6}), {.9f, .75f, .6f, .95f, .5f, .3f});
AddInputFromArray<int>(TensorShape({}), {3});
TF_ASSERT_OK(RunOpKernel());
@@ -59,9 +59,9 @@ TEST_F(NonMaxSuppressionOpTest, TestSelectFromThreeClusters) {
TEST_F(NonMaxSuppressionOpTest, TestSelectFromThreeClustersFlippedCoordinates) {
MakeOp(.5);
AddInputFromArray<float>(TensorShape({6, 4}),
- {1, 1, 0, 0, 0, 0.1, 1, 1.1, 0, .9, 1, -0.1,
- 0, 10, 1, 11, 1, 10.1, 0, 11.1, 1, 101, 0, 100});
- AddInputFromArray<float>(TensorShape({6}), {.9, .75, .6, .95, .5, .3});
+ {1, 1, 0, 0, 0, 0.1f, 1, 1.1f, 0, .9f, 1, -0.1f,
+ 0, 10, 1, 11, 1, 10.1f, 0, 11.1f, 1, 101, 0, 100});
+ AddInputFromArray<float>(TensorShape({6}), {.9f, .75f, .6f, .95f, .5f, .3f});
AddInputFromArray<int>(TensorShape({}), {3});
TF_ASSERT_OK(RunOpKernel());
@@ -73,9 +73,9 @@ TEST_F(NonMaxSuppressionOpTest, TestSelectFromThreeClustersFlippedCoordinates) {
TEST_F(NonMaxSuppressionOpTest, TestSelectAtMostTwoBoxesFromThreeClusters) {
MakeOp(.5);
AddInputFromArray<float>(TensorShape({6, 4}),
- {0, 0, 1, 1, 0, 0.1, 1, 1.1, 0, -0.1, 1, 0.9,
- 0, 10, 1, 11, 0, 10.1, 1, 11.1, 0, 100, 1, 101});
- AddInputFromArray<float>(TensorShape({6}), {.9, .75, .6, .95, .5, .3});
+ {0, 0, 1, 1, 0, 0.1f, 1, 1.1f, 0, -0.1f, 1, 0.9f,
+ 0, 10, 1, 11, 0, 10.1f, 1, 11.1f, 0, 100, 1, 101});
+ AddInputFromArray<float>(TensorShape({6}), {.9f, .75f, .6f, .95f, .5f, .3f});
AddInputFromArray<int>(TensorShape({}), {2});
TF_ASSERT_OK(RunOpKernel());
@@ -87,9 +87,9 @@ TEST_F(NonMaxSuppressionOpTest, TestSelectAtMostTwoBoxesFromThreeClusters) {
TEST_F(NonMaxSuppressionOpTest, TestSelectAtMostThirtyBoxesFromThreeClusters) {
MakeOp(.5);
AddInputFromArray<float>(TensorShape({6, 4}),
- {0, 0, 1, 1, 0, 0.1, 1, 1.1, 0, -0.1, 1, 0.9,
- 0, 10, 1, 11, 0, 10.1, 1, 11.1, 0, 100, 1, 101});
- AddInputFromArray<float>(TensorShape({6}), {.9, .75, .6, .95, .5, .3});
+ {0, 0, 1, 1, 0, 0.1f, 1, 1.1f, 0, -0.1f, 1, 0.9f,
+ 0, 10, 1, 11, 0, 10.1f, 1, 11.1f, 0, 100, 1, 101});
+ AddInputFromArray<float>(TensorShape({6}), {.9f, .75f, .6f, .95f, .5f, .3f});
AddInputFromArray<int>(TensorShape({}), {30});
TF_ASSERT_OK(RunOpKernel());
@@ -101,7 +101,7 @@ TEST_F(NonMaxSuppressionOpTest, TestSelectAtMostThirtyBoxesFromThreeClusters) {
TEST_F(NonMaxSuppressionOpTest, TestSelectSingleBox) {
MakeOp(.5);
AddInputFromArray<float>(TensorShape({1, 4}), {0, 0, 1, 1});
- AddInputFromArray<float>(TensorShape({1}), {.9});
+ AddInputFromArray<float>(TensorShape({1}), {.9f});
AddInputFromArray<int>(TensorShape({}), {3});
TF_ASSERT_OK(RunOpKernel());
@@ -136,9 +136,9 @@ TEST_F(NonMaxSuppressionOpTest, TestSelectFromTenIdenticalBoxes) {
TEST_F(NonMaxSuppressionOpTest, TestInconsistentBoxAndScoreShapes) {
MakeOp(.5);
AddInputFromArray<float>(TensorShape({6, 4}),
- {0, 0, 1, 1, 0, 0.1, 1, 1.1, 0, -0.1, 1, 0.9,
- 0, 10, 1, 11, 0, 10.1, 1, 11.1, 0, 100, 1, 101});
- AddInputFromArray<float>(TensorShape({5}), {.9, .75, .6, .95, .5});
+ {0, 0, 1, 1, 0, 0.1f, 1, 1.1f, 0, -0.1f, 1, 0.9f,
+ 0, 10, 1, 11, 0, 10.1f, 1, 11.1f, 0, 100, 1, 101});
+ AddInputFromArray<float>(TensorShape({5}), {.9f, .75f, .6f, .95f, .5f});
AddInputFromArray<int>(TensorShape({}), {30});
Status s = RunOpKernel();
@@ -151,7 +151,7 @@ TEST_F(NonMaxSuppressionOpTest, TestInconsistentBoxAndScoreShapes) {
TEST_F(NonMaxSuppressionOpTest, TestInvalidIOUThreshold) {
MakeOp(1.2);
AddInputFromArray<float>(TensorShape({1, 4}), {0, 0, 1, 1});
- AddInputFromArray<float>(TensorShape({1}), {.9});
+ AddInputFromArray<float>(TensorShape({1}), {.9f});
AddInputFromArray<int>(TensorShape({}), {3});
Status s = RunOpKernel();
diff --git a/tensorflow/core/kernels/resize_bilinear_op_test.cc b/tensorflow/core/kernels/resize_bilinear_op_test.cc
index deb36849e7..66836ff788 100644
--- a/tensorflow/core/kernels/resize_bilinear_op_test.cc
+++ b/tensorflow/core/kernels/resize_bilinear_op_test.cc
@@ -95,9 +95,10 @@ TEST_F(ResizeBilinearOpTest, TestBilinear2x2To3x3) {
// clang-format off
test::FillValues<float>(&expected,
- {1, 5.0/3, 2,
- 7.0/3, 3, 10.0/3,
- 3, 11.0/3, 4});
+ {1, 5.0f / 3, 2,
+ 7.0f / 3, 3, 10.0f / 3,
+ 3, 11.0f / 3, 4});
+
// clang-format on
test::ExpectTensorEqual<float>(expected, *GetOutput(0));
@@ -206,9 +207,9 @@ TEST_F(ResizeBilinearOpTest, TestBilinear4x4To3x3) {
// clang-format off
test::FillValues<float>(&expected,
- {1, 7.0/3, 11.0/3,
- 19.0/3, 23.0/3, 27.0/3,
- 35.0/3, 39.0/3, 43.0/3});
+ {1, 7.0f/3, 11.0f/3,
+ 19.0f/3, 23.0f/3, 27.0f/3,
+ 35.0f/3, 39.0f/3, 43.0f/3});
// clang-format on
test::ExpectTensorEqual<float>(expected, *GetOutput(0));
@@ -251,8 +252,8 @@ TEST_F(ResizeBilinearOpTest, TestBilinear2x2To3x3Batch2) {
Tensor expected(allocator(), DT_FLOAT, TensorShape({2, 3, 3, 1}));
// clang-format off
test::FillValues<float>(&expected,
- {1, 5.0/3, 2, 7.0/3, 3, 10.0/3, 3, 11.0/3, 4,
- 1, 5.0/3, 2, 7.0/3, 3, 10.0/3, 3, 11.0/3, 4
+ {1, 5.0f/3, 2, 7.0f/3, 3, 10.0f/3, 3, 11.0f/3, 4,
+ 1, 5.0f/3, 2, 7.0f/3, 3, 10.0f/3, 3, 11.0f/3, 4
});
// clang-format on
test::ExpectTensorEqual<float>(expected, *GetOutput(0));
@@ -268,15 +269,15 @@ TEST_F(ResizeBilinearOpTest, TestBilinear2x2x2To3x3x2) {
// clang-format off
test::FillValues<float>(&expected,
{
- 1, -1,
- 5.0/3, -5.0/3,
- 2, -2,
- 7.0/3, -7.0/3,
- 3, -3,
- 10.0/3, -10.0/3,
- 3, -3,
- 11.0/3, -11.0/3,
- 4, -4
+ 1, -1,
+ 5.0f/3, -5.0f/3,
+ 2, -2,
+ 7.0f/3, -7.0f/3,
+ 3, -3,
+ 10.0f/3, -10.0f/3,
+ 3, -3,
+ 11.0f/3, -11.0f/3,
+ 4, -4
});
// clang-format on
test::ExpectTensorEqual<float>(expected, *GetOutput(0));
diff --git a/tensorflow/core/kernels/spacetobatch_benchmark_test.cc b/tensorflow/core/kernels/spacetobatch_benchmark_test.cc
index a9a9bd46b7..a8c4b3746a 100644
--- a/tensorflow/core/kernels/spacetobatch_benchmark_test.cc
+++ b/tensorflow/core/kernels/spacetobatch_benchmark_test.cc
@@ -54,6 +54,8 @@ static Graph* ConstructSpaceToBatchGraph(
return g;
}
+// The BM_Expand macro is needed for this to build with VC++.
+#define BM_Expand(x) x
#define BM_SpaceToBatchDev(OP, DEVICE, DTYPE, B, H, W, D, BS, P00, P01, P10, \
P11) \
static void \
@@ -69,10 +71,10 @@ static Graph* ConstructSpaceToBatchGraph(
BENCHMARK( \
BM_##OP##_##DEVICE##_##DTYPE##_##B##_##H##_##W##_##D##_bs##BS##_pad##P00##_##P01##_##P10##_##P11);
#define BM_SpaceToBatch(OP, ...) \
- BM_SpaceToBatchDev(OP, cpu, DT_FLOAT, __VA_ARGS__); \
- BM_SpaceToBatchDev(OP, gpu, DT_FLOAT, __VA_ARGS__); \
- BM_SpaceToBatchDev(OP, cpu, DT_HALF, __VA_ARGS__); \
- BM_SpaceToBatchDev(OP, gpu, DT_HALF, __VA_ARGS__);
+ BM_Expand(BM_SpaceToBatchDev(OP, cpu, DT_FLOAT, __VA_ARGS__)); \
+ BM_Expand(BM_SpaceToBatchDev(OP, gpu, DT_FLOAT, __VA_ARGS__)); \
+ BM_Expand(BM_SpaceToBatchDev(OP, cpu, DT_HALF, __VA_ARGS__)); \
+ BM_Expand(BM_SpaceToBatchDev(OP, gpu, DT_HALF, __VA_ARGS__));
BM_SpaceToBatch(SpaceToBatch, 64, 100, 100, 64, 2, 0, 0, 0, 0);
BM_SpaceToBatch(SpaceToBatch, 64, 100, 100, 1, 2, 0, 0, 0, 0);
diff --git a/tensorflow/core/kernels/sparse_add_op_test.cc b/tensorflow/core/kernels/sparse_add_op_test.cc
index 7baf27c1d0..4cad02bbee 100644
--- a/tensorflow/core/kernels/sparse_add_op_test.cc
+++ b/tensorflow/core/kernels/sparse_add_op_test.cc
@@ -61,8 +61,10 @@ TEST_F(SparseAddOpTest, TwoD_AddSparseTensorWithSelf) {
// [3 4]
const auto indices_shape = TensorShape({4, 2});
- const gtl::ArraySlice<int64> indices = {0, 1, 1, 0, 2, 0, 2, 1};
- const gtl::ArraySlice<int64> shape = {3, 2};
+ std::initializer_list<int64> in{ 0, 1, 1, 0, 2, 0, 2, 1 };
+ const gtl::ArraySlice<int64> indices(in);
+ std::initializer_list<int64> sh{ 3, 2 };
+ const gtl::ArraySlice<int64> shape(sh);
#define ADD_TENSOR_INPUT() \
AddInputFromArray<int64>(indices_shape, indices); \
@@ -99,8 +101,10 @@ TEST_F(SparseAddOpTest, TwoD_AddSparseTensorWithSelf) {
DataType val_dtype = tensorflow::DataTypeToEnum<VALTYPE>::value; \
\
const auto indices_shape = TensorShape({4, 2}); \
- const gtl::ArraySlice<int64> indices = {0, 1, 1, 0, 2, 0, 2, 1}; \
- const gtl::ArraySlice<int64> shape = {3, 2}; \
+ std::initializer_list<int64> in{0, 1, 1, 0, 2, 0, 2, 1}; \
+ const gtl::ArraySlice<int64> indices(in); \
+ std::initializer_list<int64> sh{3, 2}; \
+ const gtl::ArraySlice<int64> shape(sh); \
\
AddInputFromArray<int64>(indices_shape, indices); \
AddInputFromArray<VALTYPE>(TensorShape({4}), {1, 2, 3, 4}); \
@@ -154,8 +158,10 @@ RUN_TEST(complex128);
MakeOp<VALTYPE>(); \
DataType val_dtype = tensorflow::DataTypeToEnum<VALTYPE>::value; \
const auto indices_shape = TensorShape({4, 2}); \
- const gtl::ArraySlice<int64> indices = {0, 1, 1, 0, 2, 0, 2, 1}; \
- const gtl::ArraySlice<int64> shape = {3, 2}; \
+ std::initializer_list<int64> in{0, 1, 1, 0, 2, 0, 2, 1}; \
+ const gtl::ArraySlice<int64> indices(in); \
+ std::initializer_list<int64> sh{3, 2}; \
+ const gtl::ArraySlice<int64> shape(sh); \
\
auto AddSparseTensor = [indices, indices_shape, shape, \
this](bool negate) { \
@@ -192,10 +198,10 @@ RUN_TEST(complex128);
}
RUN_TEST(int64, 1);
-RUN_TEST(float, 1e-3);
-RUN_TEST(double, 1e-3);
-RUN_TEST(complex64, 1e-3);
-RUN_TEST(complex128, 1e-3);
+RUN_TEST(float, 1e-3f);
+RUN_TEST(double, 1e-3f);
+RUN_TEST(complex64, 1e-3f);
+RUN_TEST(complex128, 1e-3f);
#undef RUN_TEST
} // namespace
diff --git a/tensorflow/core/kernels/sparse_dense_binary_op_shared_test.cc b/tensorflow/core/kernels/sparse_dense_binary_op_shared_test.cc
index 7ef3070d06..eaf1884243 100644
--- a/tensorflow/core/kernels/sparse_dense_binary_op_shared_test.cc
+++ b/tensorflow/core/kernels/sparse_dense_binary_op_shared_test.cc
@@ -96,8 +96,10 @@ TEST_F(SparseDenseCDivTest, SameShape) {
// [2 ] cdiv [dense: same shape, all 1's]
// [3 4]
const auto indices_shape = TensorShape({4, 2});
- const gtl::ArraySlice<int64> indices = {0, 1, 1, 0, 2, 0, 2, 1};
- const gtl::ArraySlice<int64> shape = {3, 2};
+ std::initializer_list<int64> in{ 0, 1, 1, 0, 2, 0, 2, 1 };
+ const gtl::ArraySlice<int64> indices(in);
+ std::initializer_list<int64> sh{ 3, 2 };
+ const gtl::ArraySlice<int64> shape(sh);
// Tensor dense(DT_FLOAT, TensorShape({3, 1}));
Tensor dense(DT_FLOAT, TensorShape(shape));
@@ -123,8 +125,10 @@ TEST_F(SparseDenseCDivTest, BroadcastDenseSameDims) {
// [2 ] cdiv [dense: shape [3,1], all 1's]
// [3 4]
const auto indices_shape = TensorShape({4, 2});
- const gtl::ArraySlice<int64> indices = {0, 1, 1, 0, 2, 0, 2, 1};
- const gtl::ArraySlice<int64> shape = {3, 2};
+ std::initializer_list<int64> in{ 0, 1, 1, 0, 2, 0, 2, 1 };
+ const gtl::ArraySlice<int64> indices(in);
+ std::initializer_list<int64> sh{ 3, 2 };
+ const gtl::ArraySlice<int64> shape(sh);
Tensor dense(DT_FLOAT, TensorShape({3, 1}));
auto dense_flat = dense.flat<float>();
@@ -148,8 +152,10 @@ TEST_F(SparseDenseCDivTest, BroadcastDenseFewerDims) {
// [2 ] cdiv [dense: shape [2]]
// [3 4]
const auto indices_shape = TensorShape({4, 2});
- const gtl::ArraySlice<int64> indices = {0, 1, 1, 0, 2, 0, 2, 1};
- const gtl::ArraySlice<int64> shape = {3, 2};
+ std::initializer_list<int64> in{ 0, 1, 1, 0, 2, 0, 2, 1 };
+ const gtl::ArraySlice<int64> indices(in);
+ std::initializer_list<int64> sh{ 3, 2 };
+ const gtl::ArraySlice<int64> shape(sh);
Tensor dense(DT_FLOAT, TensorShape({2}));
auto dense_flat = dense.flat<float>();
@@ -178,8 +184,10 @@ TEST_F(SparseDenseCMulTest, BroadcastDense) {
// [1 ?] where ? remains implicitly zero.
// [1.5 0]
const auto indices_shape = TensorShape({4, 2});
- const gtl::ArraySlice<int64> indices = {0, 1, 1, 0, 2, 0, 2, 1};
- const gtl::ArraySlice<int64> shape = {3, 2};
+ std::initializer_list<int64> in{ 0, 1, 1, 0, 2, 0, 2, 1 };
+ const gtl::ArraySlice<int64> indices(in);
+ std::initializer_list<int64> sh{ 3, 2 };
+ const gtl::ArraySlice<int64> shape(sh);
Tensor dense(DT_FLOAT, TensorShape({2}));
auto dense_flat = dense.flat<float>();
diff --git a/tensorflow/core/kernels/sparse_reduce_sum_op_test.cc b/tensorflow/core/kernels/sparse_reduce_sum_op_test.cc
index 2fb78a2a21..110376be42 100644
--- a/tensorflow/core/kernels/sparse_reduce_sum_op_test.cc
+++ b/tensorflow/core/kernels/sparse_reduce_sum_op_test.cc
@@ -51,8 +51,10 @@ TEST_F(SparseReduceSumOpTest, SimpleReduce) {
// [3 4]
const auto indices_shape = TensorShape({4, 2});
- const gtl::ArraySlice<int64> indices = {0, 1, 1, 0, 2, 0, 2, 1};
- const gtl::ArraySlice<int64> shape = {3, 2};
+ std::initializer_list<int64> in{ 0, 1, 1, 0, 2, 0, 2, 1 };
+ const gtl::ArraySlice<int64> indices(in);
+ std::initializer_list<int64> sh{ 3, 2 };
+ const gtl::ArraySlice<int64> shape(sh);
AddInputFromArray<int64>(indices_shape, indices);
AddInputFromArray<float>(TensorShape({4}), {1, 2, 3, 4});
@@ -91,8 +93,10 @@ TEST_F(SparseReduceSumSparseOpTest, SimpleReduce) {
// [3 4]
const auto indices_shape = TensorShape({4, 2});
- const gtl::ArraySlice<int64> indices = {0, 1, 1, 0, 2, 0, 2, 1};
- const gtl::ArraySlice<int64> shape = {3, 2};
+ std::initializer_list<int64> in{ 0, 1, 1, 0, 2, 0, 2, 1 };
+ const gtl::ArraySlice<int64> indices(in);
+ std::initializer_list<int64> sh{ 3, 2 };
+ const gtl::ArraySlice<int64> shape(sh);
AddInputFromArray<int64>(indices_shape, indices);
AddInputFromArray<float>(TensorShape({4}), {2, 2, 3, 4});
diff --git a/tensorflow/core/kernels/summary_image_op_test.cc b/tensorflow/core/kernels/summary_image_op_test.cc
index 96a4d4183f..f936276925 100644
--- a/tensorflow/core/kernels/summary_image_op_test.cc
+++ b/tensorflow/core/kernels/summary_image_op_test.cc
@@ -126,16 +126,16 @@ TEST_F(SummaryImageOpTest, OneColorImage4dInput) {
AddInputFromArray<float>(
TensorShape({1 /*batch*/, 5 /*rows*/, 2 /*columns*/, 3 /*depth*/}),
{
- /* r0, c0, RGB */ 1.0, 0.1, 0.2,
- /* r0, c1, RGB */ 1.0, 0.3, 0.4,
- /* r1, c0, RGB */ 0.0, 1.0, 0.0,
- /* r1, c1, RGB */ 0.0, 1.0, 0.0,
- /* r2, c0, RGB */ 0.0, 0.0, 1.0,
- /* r2, c1, RGB */ 0.0, 0.0, 1.0,
- /* r3, c0, RGB */ 1.0, 1.0, 0.0,
- /* r3, c1, RGB */ 1.0, 0.0, 1.0,
- /* r4, c0, RGB */ 1.0, 1.0, 0.0,
- /* r4, c1, RGB */ 1.0, 0.0, 1.0,
+ /* r0, c0, RGB */ 1.0f, 0.1f, 0.2f,
+ /* r0, c1, RGB */ 1.0f, 0.3f, 0.4f,
+ /* r1, c0, RGB */ 0.0f, 1.0f, 0.0f,
+ /* r1, c1, RGB */ 0.0f, 1.0f, 0.0f,
+ /* r2, c0, RGB */ 0.0f, 0.0f, 1.0f,
+ /* r2, c1, RGB */ 0.0f, 0.0f, 1.0f,
+ /* r3, c0, RGB */ 1.0f, 1.0f, 0.0f,
+ /* r3, c1, RGB */ 1.0f, 0.0f, 1.0f,
+ /* r4, c0, RGB */ 1.0f, 1.0f, 0.0f,
+ /* r4, c1, RGB */ 1.0f, 0.0f, 1.0f,
});
TF_ASSERT_OK(RunOpKernel());
diff --git a/tensorflow/core/kernels/summary_op_test.cc b/tensorflow/core/kernels/summary_op_test.cc
index 9fd2bd2b5e..05b1687e5f 100644
--- a/tensorflow/core/kernels/summary_op_test.cc
+++ b/tensorflow/core/kernels/summary_op_test.cc
@@ -61,7 +61,7 @@ TEST_F(SummaryScalarOpTest, SimpleFloat) {
// Feed and run
AddInputFromArray<string>(TensorShape({3}), {"tag1", "tag2", "tag3"});
- AddInputFromArray<float>(TensorShape({3}), {1.0, -0.73, 10000.0});
+ AddInputFromArray<float>(TensorShape({3}), {1.0f, -0.73f, 10000.0f});
TF_ASSERT_OK(RunOpKernel());
// Check the output size.
@@ -121,7 +121,7 @@ TEST_F(SummaryScalarOpTest, Error_MismatchedSize) {
// Feed and run
AddInputFromArray<string>(TensorShape({2}), {"tag1", "tag2"});
- AddInputFromArray<float>(TensorShape({3}), {1.0, -0.73, 10000.0});
+ AddInputFromArray<float>(TensorShape({3}), {1.0f, -0.73f, 10000.0f});
Status s = RunOpKernel();
EXPECT_TRUE(StringPiece(s.ToString()).contains("not the same shape")) << s;
}
@@ -131,7 +131,7 @@ TEST_F(SummaryScalarOpTest, Error_WrongDimsTags) {
// Feed and run
AddInputFromArray<string>(TensorShape({2, 1}), {"tag1", "tag2"});
- AddInputFromArray<float>(TensorShape({2}), {1.0, -0.73});
+ AddInputFromArray<float>(TensorShape({2}), {1.0f, -0.73f});
Status s = RunOpKernel();
EXPECT_TRUE(
StringPiece(s.ToString()).contains("tags and values not the same shape"))
@@ -143,7 +143,7 @@ TEST_F(SummaryScalarOpTest, Error_WrongDimsValues) {
// Feed and run
AddInputFromArray<string>(TensorShape({2}), {"tag1", "tag2"});
- AddInputFromArray<float>(TensorShape({2, 1}), {1.0, -0.73});
+ AddInputFromArray<float>(TensorShape({2, 1}), {1.0f, -0.73f});
Status s = RunOpKernel();
EXPECT_TRUE(
StringPiece(s.ToString()).contains("tags and values not the same shape"))
@@ -169,7 +169,8 @@ TEST_F(SummaryHistoOpTest, SimpleFloat) {
// Feed and run
AddInputFromArray<string>(TensorShape({}), {"taghisto"});
- AddInputFromArray<float>(TensorShape({3, 2}), {0.1, -0.7, 4.1, 4., 5., 4.});
+ AddInputFromArray<float>(TensorShape({3, 2}),
+ {0.1f, -0.7f, 4.1f, 4.f, 5.f, 4.f});
TF_ASSERT_OK(RunOpKernel());
// Check the output size.
@@ -254,7 +255,7 @@ TEST_F(SummaryHistoOpTest, Error_WrongDimsTags) {
// Feed and run
AddInputFromArray<string>(TensorShape({2, 1}), {"tag1", "tag2"});
- AddInputFromArray<float>(TensorShape({2}), {1.0, -0.73});
+ AddInputFromArray<float>(TensorShape({2}), {1.0f, -0.73f});
Status s = RunOpKernel();
EXPECT_TRUE(StringPiece(s.ToString()).contains("tags must be scalar")) << s;
}
@@ -264,7 +265,7 @@ TEST_F(SummaryHistoOpTest, Error_TooManyTagValues) {
// Feed and run
AddInputFromArray<string>(TensorShape({2}), {"tag1", "tag2"});
- AddInputFromArray<float>(TensorShape({2, 1}), {1.0, -0.73});
+ AddInputFromArray<float>(TensorShape({2, 1}), {1.0f, -0.73f});
Status s = RunOpKernel();
EXPECT_TRUE(StringPiece(s.ToString()).contains("tags must be scalar")) << s;
}
diff --git a/tensorflow/core/kernels/training_ops_gpu.cu.cc b/tensorflow/core/kernels/training_ops_gpu.cu.cc
index 733278e440..f6acdf2422 100644
--- a/tensorflow/core/kernels/training_ops_gpu.cu.cc
+++ b/tensorflow/core/kernels/training_ops_gpu.cu.cc
@@ -64,7 +64,7 @@ struct ApplyAdadelta<GPUDevice, T> {
bcast[0] = grad.dimension(0);
Eigen::Sizes<1> single;
- accum.device(d) = accum_update * rho.reshape(single).broadcast(bcast) +
+ accum.device(d) = accum * rho.reshape(single).broadcast(bcast) +
grad.square() * (grad.constant(T(1)) -
rho.reshape(single).broadcast(bcast));
const auto update =
diff --git a/tensorflow/core/kernels/variable_ops.cc b/tensorflow/core/kernels/variable_ops.cc
index 1a9aa4d903..34e227156d 100644
--- a/tensorflow/core/kernels/variable_ops.cc
+++ b/tensorflow/core/kernels/variable_ops.cc
@@ -33,14 +33,31 @@ REGISTER_KERNEL_BUILDER(Name("IsVariableInitialized").Device(DEVICE_CPU),
IsVariableInitializedOp);
#if TENSORFLOW_USE_SYCL
-#define REGISTER_SYCL_KERNEL(TYPE) \
- REGISTER_KERNEL_BUILDER( \
- Name("Variable").Device(DEVICE_SYCL).TypeConstraint<TYPE>("dtype"), \
- VariableOp); \
- REGISTER_KERNEL_BUILDER( \
- Name("VariableV2").Device(DEVICE_SYCL).TypeConstraint<TYPE>("dtype"), \
- VariableOp);
-TF_CALL_NUMBER_TYPES(REGISTER_SYCL_KERNEL);
+#define REGISTER_SYCL_KERNEL(TYPE) \
+ REGISTER_KERNEL_BUILDER( \
+ Name("Variable") \
+ .Device(DEVICE_SYCL) \
+ .TypeConstraint<TYPE>("dtype"), \
+ VariableOp); \
+ REGISTER_KERNEL_BUILDER(Name("VariableV2") \
+ .Device(DEVICE_SYCL) \
+ .TypeConstraint<TYPE>("dtype"), \
+ VariableOp); \
+ REGISTER_KERNEL_BUILDER(Name("TemporaryVariable") \
+ .Device(DEVICE_SYCL) \
+ .TypeConstraint<TYPE>("dtype"), \
+ TemporaryVariableOp); \
+ REGISTER_KERNEL_BUILDER(Name("DestroyTemporaryVariable") \
+ .Device(DEVICE_SYCL) \
+ .TypeConstraint<TYPE>("T"), \
+ DestroyTemporaryVariableOp); \
+ REGISTER_KERNEL_BUILDER(Name("IsVariableInitialized") \
+ .Device(DEVICE_SYCL) \
+ .TypeConstraint<TYPE>("dtype") \
+ .HostMemory("is_initialized"), \
+ IsVariableInitializedOp);
+
+REGISTER_SYCL_KERNEL(float);
#undef REGISTER_SYCL_KERNEL
#endif
diff --git a/tensorflow/core/lib/core/notification_test.cc b/tensorflow/core/lib/core/notification_test.cc
index 8cb1c895ad..9d96708b6f 100644
--- a/tensorflow/core/lib/core/notification_test.cc
+++ b/tensorflow/core/lib/core/notification_test.cc
@@ -67,7 +67,9 @@ TEST(NotificationTest, TestMultipleThreadsWaitingOnNotification) {
++counter;
});
}
- sleep(1);
+
+ // Sleep 1 second.
+ Env::Default()->SleepForMicroseconds(1 * 1000 * 1000);
EXPECT_EQ(0, counter);
diff --git a/tensorflow/core/lib/gtl/cleanup.h b/tensorflow/core/lib/gtl/cleanup.h
index 230cdb624b..6053e98640 100644
--- a/tensorflow/core/lib/gtl/cleanup.h
+++ b/tensorflow/core/lib/gtl/cleanup.h
@@ -96,7 +96,7 @@ class Cleanup {
bool is_released() const { return released_; }
private:
- static_assert(!std::is_reference<F>(), "F must not be a reference");
+ static_assert(!std::is_reference<F>::value, "F must not be a reference");
bool released_ = false;
F f_;
diff --git a/tensorflow/core/lib/gtl/edit_distance_test.cc b/tensorflow/core/lib/gtl/edit_distance_test.cc
index 02968b6ae8..18a400713f 100644
--- a/tensorflow/core/lib/gtl/edit_distance_test.cc
+++ b/tensorflow/core/lib/gtl/edit_distance_test.cc
@@ -15,6 +15,7 @@ limitations under the License.
#include "tensorflow/core/lib/gtl/edit_distance.h"
+#include <cctype>
#include <vector>
#include "tensorflow/core/platform/logging.h"
#include "tensorflow/core/platform/test.h"
diff --git a/tensorflow/core/lib/strings/strcat_test.cc b/tensorflow/core/lib/strings/strcat_test.cc
index 25561f1bd1..c556b1f676 100644
--- a/tensorflow/core/lib/strings/strcat_test.cc
+++ b/tensorflow/core/lib/strings/strcat_test.cc
@@ -22,6 +22,11 @@ limitations under the License.
#include "tensorflow/core/platform/test.h"
#include "tensorflow/core/platform/types.h"
+#ifdef _MSC_VER
+// ssize_t is not a standard C++ type.
+typedef ptrdiff_t ssize_t;
+#endif
+
namespace tensorflow {
namespace strings {
diff --git a/tensorflow/core/ops/nn_ops_test.cc b/tensorflow/core/ops/nn_ops_test.cc
index 3618769dc0..974d7aa87b 100644
--- a/tensorflow/core/ops/nn_ops_test.cc
+++ b/tensorflow/core/ops/nn_ops_test.cc
@@ -507,7 +507,7 @@ TEST(NNOpsTest, FractionalPool_ShapeFn) {
.Finalize(&op.node_def));
};
- set_op(std::vector<float>{2.0, 1, 1 / 1.5, 1 / 2.0});
+ set_op(std::vector<float>{2.0f, 1, 1 / 1.5f, 1 / 2.0f});
// Rank check.
INFER_ERROR("must be rank 4", op, "[?,?,?]");
diff --git a/tensorflow/core/ops/state_ops.cc b/tensorflow/core/ops/state_ops.cc
index d1f63589ea..8370e57b88 100644
--- a/tensorflow/core/ops/state_ops.cc
+++ b/tensorflow/core/ops/state_ops.cc
@@ -295,7 +295,7 @@ This operation outputs `ref` after the update is done.
This makes it easier to chain operations that need to use the reset value.
If values in `ref` is to be updated more than once, because there are
-duplicate entires in `indices`, the order at which the updates happen
+duplicate entries in `indices`, the order at which the updates happen
for each value is undefined.
Requires `updates.shape = indices.shape + ref.shape[1:]`.
diff --git a/tensorflow/core/platform/default/build_config.bzl b/tensorflow/core/platform/default/build_config.bzl
index 64a6ab0c7a..83a2a17d48 100644
--- a/tensorflow/core/platform/default/build_config.bzl
+++ b/tensorflow/core/platform/default/build_config.bzl
@@ -25,7 +25,7 @@ def tf_deps(deps, suffix):
return tf_deps
def tf_proto_library_cc(name, srcs = [], has_services = None,
- deps = [], visibility = [], testonly = 0,
+ protodeps = [], visibility = [], testonly = 0,
cc_libs = [],
cc_stubby_versions = None,
cc_grpc_version = None,
@@ -34,7 +34,7 @@ def tf_proto_library_cc(name, srcs = [], has_services = None,
js_api_version = 2, js_codegen = "jspb"):
native.filegroup(
name = name + "_proto_srcs",
- srcs = srcs + tf_deps(deps, "_proto_srcs"),
+ srcs = srcs + tf_deps(protodeps, "_proto_srcs"),
testonly = testonly,
)
@@ -43,10 +43,14 @@ def tf_proto_library_cc(name, srcs = [], has_services = None,
use_grpc_plugin = True
cc_proto_library(
name = name + "_cc",
- srcs = srcs + tf_deps(deps, "_proto_srcs"),
- deps = deps + ["@protobuf//:cc_wkt_protos"],
+ srcs = srcs,
+ deps = tf_deps(protodeps, "_cc") + ["@protobuf//:cc_wkt_protos"],
cc_libs = cc_libs + ["@protobuf//:protobuf"],
- copts = ["-Wno-unused-but-set-variable", "-Wno-sign-compare"],
+ copts = [
+ "-Wno-unknown-warning-option",
+ "-Wno-unused-but-set-variable",
+ "-Wno-sign-compare",
+ ],
protoc = "@protobuf//:protoc",
default_runtime = "@protobuf//:protobuf",
use_grpc_plugin = use_grpc_plugin,
@@ -54,13 +58,14 @@ def tf_proto_library_cc(name, srcs = [], has_services = None,
visibility = visibility,
)
-def tf_proto_library_py(name, srcs=[], deps=[], visibility=[], testonly=0,
+def tf_proto_library_py(name, srcs=[], protodeps=[], deps=[], visibility=[],
+ testonly=0,
srcs_version="PY2AND3"):
py_proto_library(
name = name + "_py",
srcs = srcs,
srcs_version = srcs_version,
- deps = deps,
+ deps = deps + tf_deps(protodeps, "_py") + ["@protobuf//:protobuf_python"],
protoc = "@protobuf//:protoc",
default_runtime = "@protobuf//:protobuf_python",
visibility = visibility,
@@ -68,15 +73,16 @@ def tf_proto_library_py(name, srcs=[], deps=[], visibility=[], testonly=0,
)
def tf_proto_library(name, srcs = [], has_services = None,
- deps = [], visibility = [], testonly = 0,
+ protodeps = [], visibility = [], testonly = 0,
cc_libs = [],
cc_api_version = 2, go_api_version = 2,
java_api_version = 2, py_api_version = 2,
js_api_version = 2, js_codegen = "jspb"):
+ """Make a proto library, possibly depending on other proto libraries."""
tf_proto_library_cc(
name = name,
- srcs = srcs + tf_deps(deps, "_proto_srcs"),
- deps = deps,
+ srcs = srcs,
+ protodeps = protodeps,
cc_libs = cc_libs,
testonly = testonly,
visibility = visibility,
@@ -84,9 +90,9 @@ def tf_proto_library(name, srcs = [], has_services = None,
tf_proto_library_py(
name = name,
- srcs = srcs + tf_deps(deps, "_proto_srcs"),
+ srcs = srcs,
+ protodeps = protodeps,
srcs_version = "PY2AND3",
- deps = deps + ["@protobuf//:protobuf_python"],
testonly = testonly,
visibility = visibility,
)
@@ -155,7 +161,16 @@ def tf_additional_test_deps():
return []
def tf_additional_test_srcs():
- return ["platform/default/test_benchmark.cc", "platform/posix/test.cc"]
+ return [
+ "platform/default/test_benchmark.cc",
+ ] + select({
+ "//tensorflow:windows" : [
+ "platform/windows/test.cc"
+ ],
+ "//conditions:default" : [
+ "platform/posix/test.cc",
+ ],
+ })
def tf_kernel_tests_linkstatic():
return 0
diff --git a/tensorflow/core/platform/default/build_config/BUILD b/tensorflow/core/platform/default/build_config/BUILD
index 4ef795edcc..0857010f7c 100644
--- a/tensorflow/core/platform/default/build_config/BUILD
+++ b/tensorflow/core/platform/default/build_config/BUILD
@@ -10,7 +10,6 @@ exports_files(["LICENSE"])
load("//tensorflow:tensorflow.bzl", "if_cuda")
load("//tensorflow:tensorflow.bzl", "tf_copts")
load("//tensorflow:tensorflow.bzl", "tf_cuda_library")
-load("@local_config_cuda//cuda:platform.bzl", "cuda_library_path")
load("@local_config_sycl//sycl:platform.bzl", "sycl_library_path")
cc_library(
@@ -138,7 +137,7 @@ filegroup(
cc_library(
name = "cuda",
data = [
- "@local_config_cuda//cuda:{}".format(cuda_library_path("cudart")),
+ "@local_config_cuda//cuda:cudart",
],
linkopts = select({
"@local_config_cuda//cuda:darwin": [
diff --git a/tensorflow/core/platform/default/logging.cc b/tensorflow/core/platform/default/logging.cc
index e7808ca08d..1d03725c78 100644
--- a/tensorflow/core/platform/default/logging.cc
+++ b/tensorflow/core/platform/default/logging.cc
@@ -81,7 +81,41 @@ void LogMessage::GenerateLogMessage() {
}
#endif
-LogMessage::~LogMessage() { GenerateLogMessage(); }
+
+namespace {
+
+int64 MinLogLevel() {
+ const char* tf_env_var_val = getenv("TF_CPP_MIN_LOG_LEVEL");
+ if (tf_env_var_val == nullptr) {
+ return 0;
+ }
+
+ // Ideally we would use env_var / safe_strto64, but it is
+ // hard to use here without pulling in a lot of dependencies,
+ // so we do a poor-man's parsing.
+ string min_log_level(tf_env_var_val);
+ if (min_log_level == "1") {
+ // Maps to WARNING
+ return 1;
+ } else if (min_log_level == "2") {
+ // Maps to ERROR
+ return 2;
+ } else if (min_log_level == "3") {
+ // Maps to FATAL
+ return 3;
+ } else {
+ // Maps to INFO (the default).
+ return 0;
+ }
+}
+
+} // namespace
+
+LogMessage::~LogMessage() {
+ // Read the min log level once during the first call to logging.
+ static int64 min_log_level = MinLogLevel();
+ if (TF_PREDICT_TRUE(severity_ >= min_log_level)) GenerateLogMessage();
+}
LogMessageFatal::LogMessageFatal(const char* file, int line)
: LogMessage(file, line, FATAL) {}
diff --git a/tensorflow/core/platform/env.h b/tensorflow/core/platform/env.h
index 787ebe654b..428a45576f 100644
--- a/tensorflow/core/platform/env.h
+++ b/tensorflow/core/platform/env.h
@@ -208,12 +208,10 @@ class Env {
// TODO(jeff,sanjay): if needed, tighten spec so relative to epoch, or
// provide a routine to get the absolute time.
- /// \brief Returns the number of micro-seconds since some fixed point in
- /// time. Only useful for computing deltas of time.
+ /// \brief Returns the number of micro-seconds since the Unix epoch.
virtual uint64 NowMicros() = 0;
- /// \brief Returns the number of seconds since some fixed point in
- /// time. Only useful for computing deltas of time.
+ /// \brief Returns the number of seconds since the Unix epoch.
virtual uint64 NowSeconds() { return NowMicros() / 1000000L; }
/// Sleeps/delays the thread for the prescribed number of micro-seconds.
diff --git a/tensorflow/core/platform/hadoop/hadoop_file_system.cc b/tensorflow/core/platform/hadoop/hadoop_file_system.cc
index 3de3b17517..b0f0cbe3f1 100644
--- a/tensorflow/core/platform/hadoop/hadoop_file_system.cc
+++ b/tensorflow/core/platform/hadoop/hadoop_file_system.cc
@@ -112,6 +112,11 @@ class LibHDFS {
}
string path = io::JoinPath(hdfs_home, "lib", "native", "libhdfs.so");
status_ = TryLoadAndBind(path.c_str(), &handle_);
+ if (!status_.ok()) {
+ // Try to load libhdfs.so using the dynamic loader's search path,
+ // in case libhdfs.so is installed in a non-standard location.
+ status_ = TryLoadAndBind("libhdfs.so", &handle_);
+ }
return;
}
diff --git a/tensorflow/core/platform/port_test.cc b/tensorflow/core/platform/port_test.cc
index 78d000bff8..402c718e4f 100644
--- a/tensorflow/core/platform/port_test.cc
+++ b/tensorflow/core/platform/port_test.cc
@@ -36,8 +36,14 @@ TEST(ConditionVariable, WaitForMilliseconds_Timeout) {
mutex m;
mutex_lock l(m);
condition_variable cv;
+ ConditionResult result = kCond_MaybeNotified;
time_t start = time(NULL);
- EXPECT_EQ(WaitForMilliseconds(&l, &cv, 3000), kCond_Timeout);
+ // Condition variables are subject to spurious wakeups on some platforms,
+ // so need to check for a timeout within a loop.
+ while (result == kCond_MaybeNotified) {
+ result = WaitForMilliseconds(&l, &cv, 3000);
+ }
+ EXPECT_EQ(result, kCond_Timeout);
time_t finish = time(NULL);
EXPECT_GE(finish - start, 3);
}
@@ -51,7 +57,7 @@ TEST(ConditionVariable, WaitForMilliseconds_Signalled) {
// Sleep for just 1 second then notify. We have a timeout of 3 secs,
// so the condition variable will notice the cv signal before the timeout.
pool.Schedule([&m, &cv]() {
- sleep(1);
+ Env::Default()->SleepForMicroseconds(1 * 1000 * 1000);
mutex_lock l(m);
cv.notify_all();
});
diff --git a/tensorflow/core/platform/subprocess.h b/tensorflow/core/platform/subprocess.h
index 7dfd38688d..dfdcf82173 100644
--- a/tensorflow/core/platform/subprocess.h
+++ b/tensorflow/core/platform/subprocess.h
@@ -53,7 +53,7 @@ class SubProcess;
defined(PLATFORM_GOOGLE_ANDROID)
#include "tensorflow/core/platform/posix/subprocess.h"
#elif defined(PLATFORM_WINDOWS)
-#error SubProcess not yet implemented for Windows
+#include "tensorflow/core/platform/windows/subprocess.h"
#else
#error Define the appropriate PLATFORM_<foo> macro for this platform
#endif
diff --git a/tensorflow/core/platform/windows/subprocess.h b/tensorflow/core/platform/windows/subprocess.h
new file mode 100644
index 0000000000..b65313363e
--- /dev/null
+++ b/tensorflow/core/platform/windows/subprocess.h
@@ -0,0 +1,27 @@
+/* Copyright 2016 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifndef TENSORFLOW_PLATFORM_WINDOWS_SUBPROCESS_H_
+#define TENSORFLOW_PLATFORM_WINDOWS_SUBPROCESS_H_
+
+namespace tensorflow {
+
+// SubProcess is not yet implemented for Windows.
+class SubProcess {
+};
+
+} // namespace tensorflow
+
+#endif // TENSORFLOW_PLATFORM_WINDOWS_SUBPROCESS_H_
diff --git a/tensorflow/core/platform/windows/test.cc b/tensorflow/core/platform/windows/test.cc
new file mode 100644
index 0000000000..0ffd02ff14
--- /dev/null
+++ b/tensorflow/core/platform/windows/test.cc
@@ -0,0 +1,51 @@
+/* Copyright 2016 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/core/platform/net.h"
+#include "tensorflow/core/platform/test.h"
+
+#include "tensorflow/core/lib/strings/strcat.h"
+#include "tensorflow/core/platform/logging.h"
+
+namespace tensorflow {
+namespace testing {
+
+std::unique_ptr<SubProcess> CreateSubProcess(const std::vector<string>& argv) {
+ LOG(FATAL) << "CreateSubProcess NOT IMPLEMENTED for Windows yet ! ";
+ return nullptr;
+}
+
+int PickUnusedPortOrDie() { return internal::PickUnusedPortOrDie(); }
+
+string TensorFlowSrcRoot() {
+ // 'bazel test' and cmake set TEST_SRCDIR.
+ // New versions of bazel also set TEST_WORKSPACE.
+ const char* env = getenv("TEST_SRCDIR");
+ const char* workspace = getenv("TEST_WORKSPACE");
+ if (env && env[0] != '\0') {
+ if (workspace && workspace[0] != '\0') {
+ return strings::StrCat(env, "/", workspace, "/tensorflow");
+ } else {
+ return strings::StrCat(env, "/tensorflow");
+ }
+ } else {
+ LOG(WARNING) << "TEST_SRCDIR environment variable not set: "
+ << "using $PWD/tensorflow as TensorFlowSrcRoot() for tests.";
+ return "tensorflow";
+ }
+}
+
+} // namespace testing
+} // namespace tensorflow
diff --git a/tensorflow/core/platform/windows/windows_file_system.cc b/tensorflow/core/platform/windows/windows_file_system.cc
index 31516bb2ee..670abf3fdf 100644
--- a/tensorflow/core/platform/windows/windows_file_system.cc
+++ b/tensorflow/core/platform/windows/windows_file_system.cc
@@ -467,6 +467,23 @@ Status WindowsFileSystem::RenameFile(const string& src, const string& target) {
return result;
}
+Status WindowsFileSystem::GetMatchingPaths(const string& pattern,
+ std::vector<string>* results) {
+ // NOTE(mrry): The existing implementation of FileSystem::GetMatchingPaths()
+ // does not handle Windows paths containing backslashes correctly. Since
+ // Windows APIs will accept forward and backslashes equivalently, we
+ // convert the pattern to use forward slashes exclusively. Note that this
+ // is not ideal, since the API expects backslash as an escape character,
+ // but no code appears to rely on this behavior.
+ string converted_pattern(pattern);
+ std::replace(converted_pattern.begin(), converted_pattern.end(), '\\', '/');
+ TF_RETURN_IF_ERROR(FileSystem::GetMatchingPaths(converted_pattern, results));
+ for (string& result : *results) {
+ std::replace(result.begin(), result.end(), '/', '\\');
+ }
+ return Status::OK();
+}
+
Status WindowsFileSystem::Stat(const string& fname, FileStatistics* stat) {
Status result;
struct _stat sbuf;
diff --git a/tensorflow/core/platform/windows/windows_file_system.h b/tensorflow/core/platform/windows/windows_file_system.h
index dd83a27caf..507290e9e6 100644
--- a/tensorflow/core/platform/windows/windows_file_system.h
+++ b/tensorflow/core/platform/windows/windows_file_system.h
@@ -48,6 +48,9 @@ class WindowsFileSystem : public FileSystem {
Status GetChildren(const string& dir, std::vector<string>* result) override;
+ Status GetMatchingPaths(const string& pattern,
+ std::vector<string>* result) override;
+
Status Stat(const string& fname, FileStatistics* stat) override;
Status DeleteFile(const string& fname) override;
diff --git a/tensorflow/core/public/version.h b/tensorflow/core/public/version.h
index 1de976fb3d..34673be216 100644
--- a/tensorflow/core/public/version.h
+++ b/tensorflow/core/public/version.h
@@ -19,7 +19,7 @@ limitations under the License.
// TensorFlow uses semantic versioning, see http://semver.org/.
#define TF_MAJOR_VERSION 0
-#define TF_MINOR_VERSION 11
+#define TF_MINOR_VERSION 12
#define TF_PATCH_VERSION head
// TF_VERSION_SUFFIX is non-empty for pre-releases (e.g. "-alpha", "-alpha.1",
diff --git a/tensorflow/core/util/memmapped_file_system.cc b/tensorflow/core/util/memmapped_file_system.cc
index d67f948f1d..e077e94cf8 100644
--- a/tensorflow/core/util/memmapped_file_system.cc
+++ b/tensorflow/core/util/memmapped_file_system.cc
@@ -177,8 +177,13 @@ const void* MemmappedFileSystem::GetMemoryWithOffset(uint64 offset) const {
return reinterpret_cast<const uint8*>(mapped_memory_->data()) + offset;
}
+#if defined(COMPILER_MSVC)
+constexpr char* MemmappedFileSystem::kMemmappedPackagePrefix;
+constexpr char* MemmappedFileSystem::kMemmappedPackageDefaultGraphDef;
+#else
constexpr char MemmappedFileSystem::kMemmappedPackagePrefix[];
constexpr char MemmappedFileSystem::kMemmappedPackageDefaultGraphDef[];
+#endif
Status MemmappedFileSystem::InitializeFromFile(Env* env,
const string& filename) {
diff --git a/tensorflow/core/util/memmapped_file_system.h b/tensorflow/core/util/memmapped_file_system.h
index d64c4a765c..541587aeab 100644
--- a/tensorflow/core/util/memmapped_file_system.h
+++ b/tensorflow/core/util/memmapped_file_system.h
@@ -53,9 +53,19 @@ class MemmappedFileSystem : public FileSystem {
public:
// Memmapped regions use this prefix to distinguish from
// the filesystem.
- static constexpr char kMemmappedPackagePrefix[] = "memmapped_package://";
- // The default graphdef in the package.
+#if defined(COMPILER_MSVC)
+ static constexpr char* kMemmappedPackagePrefix =
+#else
+ static constexpr char kMemmappedPackagePrefix[] =
+#endif
+ "memmapped_package://";
+
+// The default graphdef in the package.
+#if defined(COMPILER_MSVC)
+ static constexpr char* kMemmappedPackageDefaultGraphDef =
+#else
static constexpr char kMemmappedPackageDefaultGraphDef[] =
+#endif
"memmapped_package://.";
MemmappedFileSystem();
diff --git a/tensorflow/core/util/memmapped_file_system_test.cc b/tensorflow/core/util/memmapped_file_system_test.cc
index c7d919041a..179c72c1f5 100644
--- a/tensorflow/core/util/memmapped_file_system_test.cc
+++ b/tensorflow/core/util/memmapped_file_system_test.cc
@@ -137,8 +137,15 @@ TEST(MemmappedFileSystemTest, ProxyToDefault) {
const string dir = testing::TmpDir();
const string filename = io::JoinPath(dir, "test_file");
// Check that we can create write and read ordinary file.
- std::unique_ptr<WritableFile> writable_file;
- TF_ASSERT_OK(memmapped_env.NewAppendableFile(filename, &writable_file));
+ std::unique_ptr<WritableFile> writable_file_temp;
+ TF_ASSERT_OK(memmapped_env.NewAppendableFile(filename, &writable_file_temp));
+ // Making sure to clean up after the test finishes.
+ const auto adh = [&memmapped_env, &filename](WritableFile* f) {
+ delete f;
+ memmapped_env.DeleteFile(filename);
+ };
+ std::unique_ptr<WritableFile, decltype(adh)> writable_file(
+ writable_file_temp.release(), adh);
const string test_string = "bla-bla-bla";
TF_ASSERT_OK(writable_file->Append(test_string));
TF_ASSERT_OK(writable_file->Close());
diff --git a/tensorflow/core/util/semver_test.cc b/tensorflow/core/util/semver_test.cc
index 75994a658e..0647f670c7 100644
--- a/tensorflow/core/util/semver_test.cc
+++ b/tensorflow/core/util/semver_test.cc
@@ -63,6 +63,10 @@ TEST(SemverTest, VersionStringFollowsSemver) {
if (major == 0 && minor <= 11) {
return;
}
+ if (str_util::ConsumePrefix(&semver, "head")) {
+ ASSERT_TRUE(semver.empty());
+ return;
+ }
ASSERT_TRUE(str_util::ConsumeLeadingDigits(&semver, &patch));
if (semver.empty()) return;
if (semver[0] == '-') {
diff --git a/tensorflow/core/util/sparse/sparse_tensor.h b/tensorflow/core/util/sparse/sparse_tensor.h
index a575d98da3..9d6f9e8bb5 100644
--- a/tensorflow/core/util/sparse/sparse_tensor.h
+++ b/tensorflow/core/util/sparse/sparse_tensor.h
@@ -17,8 +17,9 @@ limitations under the License.
#define TENSORFLOW_UTIL_SPARSE_SPARSE_TENSOR_H_
#include <limits>
-
+#include <numeric>
#include <vector>
+
#include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
#include "tensorflow/core/framework/tensor.h"
#include "tensorflow/core/framework/tensor_types.h"
diff --git a/tensorflow/core/util/stat_summarizer.cc b/tensorflow/core/util/stat_summarizer.cc
index 0b675eaac9..6bd3d9c780 100644
--- a/tensorflow/core/util/stat_summarizer.cc
+++ b/tensorflow/core/util/stat_summarizer.cc
@@ -340,10 +340,10 @@ std::string StatSummarizer::GetStatsByOrderOfNodeDefinitions(
std::string StatSummarizer::GetOutputString() const {
std::stringstream stream;
- stream << "Total time (us): " << run_total_micros_;
+ stream << "Total time (us): " << run_total_micros_ << std::endl;
stream << GetTimingStatsByRunOrder();
stream << GetTimingStatsByTopDurations();
- stream << "Total Memory (bytes): " << memory_;
+ stream << "Total Memory (bytes): " << memory_ << std::endl;
stream << GetMemoryStatsByRunOrder();
stream << GetMemoryStatsByUsage();
return stream.str();
diff --git a/tensorflow/examples/label_image/main.cc b/tensorflow/examples/label_image/main.cc
index 3a927ca14b..544b1b2738 100644
--- a/tensorflow/examples/label_image/main.cc
+++ b/tensorflow/examples/label_image/main.cc
@@ -93,7 +93,8 @@ Status ReadTensorFromImageFile(string file_name, const int input_height,
string input_name = "file_reader";
string output_name = "normalized";
- auto file_reader = ReadFile(root.WithOpName(input_name), file_name);
+ auto file_reader = tensorflow::ops::ReadFile(root.WithOpName(input_name),
+ file_name);
// Now try to figure out what kind of file it is and decode it.
const int wanted_channels = 3;
Output image_reader;
diff --git a/tensorflow/examples/tutorials/mnist/fully_connected_feed.py b/tensorflow/examples/tutorials/mnist/fully_connected_feed.py
index fbf4000e8f..be50f4529f 100644
--- a/tensorflow/examples/tutorials/mnist/fully_connected_feed.py
+++ b/tensorflow/examples/tutorials/mnist/fully_connected_feed.py
@@ -108,7 +108,7 @@ def do_eval(sess,
images_placeholder,
labels_placeholder)
true_count += sess.run(eval_correct, feed_dict=feed_dict)
- precision = true_count / num_examples
+ precision = float(true_count) / num_examples
print(' Num examples: %d Num correct: %d Precision @ 1: %0.04f' %
(num_examples, true_count, precision))
@@ -146,7 +146,7 @@ def run_training():
init = tf.global_variables_initializer()
# Create a saver for writing training checkpoints.
- saver = tf.train.Saver(write_version=tf.train.SaverDef.V2)
+ saver = tf.train.Saver()
# Create a session for running Ops on the Graph.
sess = tf.Session()
diff --git a/tensorflow/examples/tutorials/mnist/mnist_softmax.py b/tensorflow/examples/tutorials/mnist/mnist_softmax.py
index 9d00c0f9af..42a406d386 100644
--- a/tensorflow/examples/tutorials/mnist/mnist_softmax.py
+++ b/tensorflow/examples/tutorials/mnist/mnist_softmax.py
@@ -25,7 +25,6 @@ from __future__ import print_function
import argparse
import sys
-# Import data
from tensorflow.examples.tutorials.mnist import input_data
import tensorflow as tf
@@ -34,6 +33,7 @@ FLAGS = None
def main(_):
+ # Import data
mnist = input_data.read_data_sets(FLAGS.data_dir, one_hot=True)
# Create the model
@@ -58,8 +58,8 @@ def main(_):
train_step = tf.train.GradientDescentOptimizer(0.5).minimize(cross_entropy)
sess = tf.InteractiveSession()
- # Train
tf.global_variables_initializer().run()
+ # Train
for _ in range(1000):
batch_xs, batch_ys = mnist.train.next_batch(100)
sess.run(train_step, feed_dict={x: batch_xs, y_: batch_ys})
diff --git a/tensorflow/examples/udacity/6_lstm.ipynb b/tensorflow/examples/udacity/6_lstm.ipynb
index 159182c3fa..64e913acf8 100644
--- a/tensorflow/examples/udacity/6_lstm.ipynb
+++ b/tensorflow/examples/udacity/6_lstm.ipynb
@@ -167,10 +167,10 @@
},
"source": [
"def read_data(filename):\n",
- " f = zipfile.ZipFile(filename)\n",
- " for name in f.namelist():\n",
- " return tf.compat.as_str(f.read(name))\n",
- " f.close()\n",
+ " with zipfile.ZipFile(filename) as f:\n",
+ " name = f.namelist()[0]\n",
+ " data = tf.compat.as_str(f.read(name))\n",
+ " return data\n",
" \n",
"text = read_data(filename)\n",
"print('Data size %d' % len(text))"
diff --git a/tensorflow/g3doc/api_docs/cc/ClassEnv.md b/tensorflow/g3doc/api_docs/cc/ClassEnv.md
index 88a39a5ee8..236ffdeeb2 100644
--- a/tensorflow/g3doc/api_docs/cc/ClassEnv.md
+++ b/tensorflow/g3doc/api_docs/cc/ClassEnv.md
@@ -78,9 +78,9 @@ The returned memory region can be accessed from many threads in parallel.
The ownership of the returned ReadOnlyMemoryRegion is passed to the caller and the object should be deleted when is not used. The memory region object shouldn&apos;t live longer than the Env object.
-#### `bool tensorflow::Env::FileExists(const string &fname)` {#bool_tensorflow_Env_FileExists}
+#### `Status tensorflow::Env::FileExists(const string &fname)` {#Status_tensorflow_Env_FileExists}
-Returns true iff the named file exists.
+Returns OK if the named path exists and NOT_FOUND otherwise.
@@ -92,7 +92,7 @@ Original contents of *results are dropped.
#### `virtual bool tensorflow::Env::MatchPath(const string &path, const string &pattern)=0` {#virtual_bool_tensorflow_Env_MatchPath}
-Returns true if the path matches the given pattern. The wildcards allowed in pattern are described below (GetMatchingPaths).
+Returns true if the path matches the given pattern. The wildcards allowed in pattern are described in FileSystem::GetMatchingPaths.
@@ -100,13 +100,7 @@ Returns true if the path matches the given pattern. The wildcards allowed in pat
Given a pattern, stores in *results the set of paths that matches that pattern. *results is cleared.
-pattern must match all of a name, not just a substring. pattern: { term } term: &apos;*&apos;: matches any sequence of non-&apos;/&apos; characters &apos;?&apos;: matches a single non-&apos;/&apos; character &apos;[&apos; [ &apos;^&apos; ] { match-list } &apos;]&apos;: matches any single character (not) on the list c: matches character c (c != &apos;*&apos;, &apos;?&apos;, &apos;\&apos;, &apos;[&apos;) &apos;\&apos; c: matches character c character-range: c: matches character c (c != &apos;\&apos;, &apos;-&apos;, &apos;]&apos;) &apos;\&apos; c: matches character c lo &apos;-&apos; hi: matches character c for lo <= c <= hi
-
-Typical return codes
-
-OK - no errors
-
-UNIMPLEMENTED - Some underlying functions (like GetChildren) are not implemented The default implementation uses a combination of GetChildren, MatchPath and IsDirectory.
+More details about `pattern` in FileSystem::GetMatchingPaths.
#### `Status tensorflow::Env::DeleteFile(const string &fname)` {#Status_tensorflow_Env_DeleteFile}
@@ -238,6 +232,12 @@ Caller takes ownership of the result and must delete it eventually (the deletion
+#### `virtual string tensorflow::Env::FormatLibraryFileName(const string &name, const string &version)=0` {#virtual_string_tensorflow_Env_FormatLibraryFileName}
+
+
+
+
+
#### `static Env* tensorflow::Env::Default()` {#static_Env_tensorflow_Env_Default}
Returns a default environment suitable for the current operating system.
diff --git a/tensorflow/g3doc/api_docs/cc/ClassEnvWrapper.md b/tensorflow/g3doc/api_docs/cc/ClassEnvWrapper.md
index 153dc8ca36..70462b7eb8 100644
--- a/tensorflow/g3doc/api_docs/cc/ClassEnvWrapper.md
+++ b/tensorflow/g3doc/api_docs/cc/ClassEnvWrapper.md
@@ -44,7 +44,7 @@ Returns the file system schemes registered for this Env .
#### `bool tensorflow::EnvWrapper::MatchPath(const string &path, const string &pattern) override` {#bool_tensorflow_EnvWrapper_MatchPath}
-Returns true if the path matches the given pattern. The wildcards allowed in pattern are described below (GetMatchingPaths).
+Returns true if the path matches the given pattern. The wildcards allowed in pattern are described in FileSystem::GetMatchingPaths.
@@ -89,3 +89,9 @@ Caller takes ownership of the result and must delete it eventually (the deletion
+
+#### `string tensorflow::EnvWrapper::FormatLibraryFileName(const string &name, const string &version) override` {#string_tensorflow_EnvWrapper_FormatLibraryFileName}
+
+
+
+
diff --git a/tensorflow/g3doc/api_docs/cc/ClassStatus.md b/tensorflow/g3doc/api_docs/cc/ClassStatus.md
index a5d332128b..8956af75ec 100644
--- a/tensorflow/g3doc/api_docs/cc/ClassStatus.md
+++ b/tensorflow/g3doc/api_docs/cc/ClassStatus.md
@@ -1,6 +1,6 @@
# `class tensorflow::Status`
-
+Denotes success or failure of a call in Tensorflow.
diff --git a/tensorflow/g3doc/api_docs/cc/ClassTensorShape.md b/tensorflow/g3doc/api_docs/cc/ClassTensorShape.md
index 5eba11a0df..51fad8c2fa 100644
--- a/tensorflow/g3doc/api_docs/cc/ClassTensorShape.md
+++ b/tensorflow/g3doc/api_docs/cc/ClassTensorShape.md
@@ -2,7 +2,11 @@
+Represents the shape of a Tensor .
+A tensor&apos;s shape is denoted by its number of dimensions and a size for each dimension. For example, a Tensor represented by a 3 x 4 matrix would have a shape of 2-D, [3,4].
+
+If you know the exact shape of your Tensor when you create the TensorShape object, you can specify it then, or you can create a TensorShape with zero dimensions and one element, and call AddDim() to add dimensions later.
###Member Details
diff --git a/tensorflow/g3doc/api_docs/cc/ClassTensorShapeUtils.md b/tensorflow/g3doc/api_docs/cc/ClassTensorShapeUtils.md
index 96f20e856c..7d8c36ddec 100644
--- a/tensorflow/g3doc/api_docs/cc/ClassTensorShapeUtils.md
+++ b/tensorflow/g3doc/api_docs/cc/ClassTensorShapeUtils.md
@@ -72,8 +72,14 @@ Returns a ` TensorShape ` whose dimensions are `dims[0]`, `dims[1]`, ..., `dims[
-#### `bool tensorflow::TensorShapeUtils::StartsWith(const TensorShape &shape0, const TensorShape &shape1)` {#bool_tensorflow_TensorShapeUtils_StartsWith}
+#### `bool tensorflow::TensorShapeUtils::StartsWith(const TensorShape &shape, const TensorShape &prefix)` {#bool_tensorflow_TensorShapeUtils_StartsWith}
+Returns true iff `shape` starts with `prefix`.
+#### `bool tensorflow::TensorShapeUtils::EndsWith(const TensorShape &shape, const TensorShape &suffix)` {#bool_tensorflow_TensorShapeUtils_EndsWith}
+
+Returns true iff `shape` ends with `suffix`.
+
+
diff --git a/tensorflow/g3doc/api_docs/cc/ClassThread.md b/tensorflow/g3doc/api_docs/cc/ClassThread.md
index 526353ec20..56127d72ad 100644
--- a/tensorflow/g3doc/api_docs/cc/ClassThread.md
+++ b/tensorflow/g3doc/api_docs/cc/ClassThread.md
@@ -1,6 +1,6 @@
# `class tensorflow::Thread`
-
+Represents a thread used to run a Tensorflow function.
diff --git a/tensorflow/g3doc/api_docs/cc/StructTensorShapeDim.md b/tensorflow/g3doc/api_docs/cc/StructTensorShapeDim.md
index f2471b1988..509491f27c 100644
--- a/tensorflow/g3doc/api_docs/cc/StructTensorShapeDim.md
+++ b/tensorflow/g3doc/api_docs/cc/StructTensorShapeDim.md
@@ -1,6 +1,6 @@
# `struct tensorflow::TensorShapeDim`
-
+Represents the value of one dimension in a TensorShape .
diff --git a/tensorflow/g3doc/api_docs/python/constant_op.md b/tensorflow/g3doc/api_docs/python/constant_op.md
index 41ae54f009..941915e2b1 100644
--- a/tensorflow/g3doc/api_docs/python/constant_op.md
+++ b/tensorflow/g3doc/api_docs/python/constant_op.md
@@ -170,7 +170,7 @@ fill([2, 3], 9) ==> [[9, 9, 9]
- - -
-### `tf.constant(value, dtype=None, shape=None, name='Const')` {#constant}
+### `tf.constant(value, dtype=None, shape=None, name='Const', verify_shape=False)` {#constant}
Creates a constant tensor.
@@ -216,6 +216,9 @@ Creates a constant tensor.
* <b>`name`</b>: Optional name for the tensor.
+
+* <b>`verify_shape`</b>: Boolean that enables verification of the shape of values.
+
##### Returns:
A Constant Tensor.
diff --git a/tensorflow/g3doc/api_docs/python/contrib.layers.md b/tensorflow/g3doc/api_docs/python/contrib.layers.md
index 8407b703a8..e3c965e704 100644
--- a/tensorflow/g3doc/api_docs/python/contrib.layers.md
+++ b/tensorflow/g3doc/api_docs/python/contrib.layers.md
@@ -78,7 +78,10 @@ can have speed penalty, specially in distributed settings.
`batch_size`. The normalization is over all but the last dimension if
`data_format` is `NHWC` and the second dimension if `data_format` is
`NCHW`.
-* <b>`decay`</b>: decay for the moving average.
+* <b>`decay`</b>: decay for the moving average. Reasonable values for `decay` are close
+ to 1.0, typically in the multiple-nines range: 0.999, 0.99, 0.9, etc. Lower
+ `decay` value (recommend trying `decay`=0.9) if model experiences reasonably
+ good training performance but poor validation and/or test performance.
* <b>`center`</b>: If True, subtract `beta`. If False, `beta` is ignored.
* <b>`scale`</b>: If True, multiply by `gamma`. If False, `gamma` is
not used. When the next layer is linear (also e.g. `nn.relu`), this can be
diff --git a/tensorflow/g3doc/api_docs/python/contrib.util.md b/tensorflow/g3doc/api_docs/python/contrib.util.md
index b46595d5d1..103b2088d7 100644
--- a/tensorflow/g3doc/api_docs/python/contrib.util.md
+++ b/tensorflow/g3doc/api_docs/python/contrib.util.md
@@ -43,7 +43,7 @@ permits static shape optimizations.
- - -
-### `tf.contrib.util.make_tensor_proto(values, dtype=None, shape=None)` {#make_tensor_proto}
+### `tf.contrib.util.make_tensor_proto(values, dtype=None, shape=None, verify_shape=False)` {#make_tensor_proto}
Create a TensorProto.
@@ -53,6 +53,7 @@ Create a TensorProto.
* <b>`values`</b>: Values to put in the TensorProto.
* <b>`dtype`</b>: Optional tensor_pb2 DataType value.
* <b>`shape`</b>: List of integers representing the dimensions of tensor.
+* <b>`verify_shape`</b>: Boolean that enables verification of the shape of values.
##### Returns:
@@ -65,7 +66,8 @@ Create a TensorProto.
* <b>`TypeError`</b>: if unsupported types are provided.
-* <b>`ValueError`</b>: if arguments have inappropriate values.
+* <b>`ValueError`</b>: if arguments have inappropriate values or if verify_shape is
+  True and the shape of values is not equal to the shape from the argument.
make_tensor_proto accepts "values" of a python scalar, a python list, a
numpy ndarray, or a numpy scalar.
diff --git a/tensorflow/g3doc/api_docs/python/framework.md b/tensorflow/g3doc/api_docs/python/framework.md
index 7aae9ea276..91e15c1028 100644
--- a/tensorflow/g3doc/api_docs/python/framework.md
+++ b/tensorflow/g3doc/api_docs/python/framework.md
@@ -2835,6 +2835,17 @@ The following standard keys are defined:
* `WEIGHTS`: weights inside neural network layers
* `BIASES`: biases inside neural network layers
* `ACTIVATIONS`: activations of neural network layers
+- - -
+
+#### `tf.GraphKeys.VARIABLES` {#GraphKeys.VARIABLES}
+
+DEPRECATED FUNCTION
+
+THIS FUNCTION IS DEPRECATED. It will be removed after 2017-03-02.
+Instructions for updating:
+VARIABLES collection name is deprecated, please use GLOBAL_VARIABLES instead
+
+
## Defining new operations
diff --git a/tensorflow/g3doc/api_docs/python/functions_and_classes/shard0/tf.summary.TaggedRunMetadata.md b/tensorflow/g3doc/api_docs/python/functions_and_classes/shard0/tf.summary.TaggedRunMetadata.md
index 788d2066ad..8dc62c4c18 100644
--- a/tensorflow/g3doc/api_docs/python/functions_and_classes/shard0/tf.summary.TaggedRunMetadata.md
+++ b/tensorflow/g3doc/api_docs/python/functions_and_classes/shard0/tf.summary.TaggedRunMetadata.md
@@ -1,8 +1,252 @@
- - -
+#### `tf.summary.TaggedRunMetadata.ByteSize()` {#TaggedRunMetadata.ByteSize}
+
+
+
+
+- - -
+
+#### `tf.summary.TaggedRunMetadata.Clear()` {#TaggedRunMetadata.Clear}
+
+
+
+
+- - -
+
+#### `tf.summary.TaggedRunMetadata.ClearExtension(extension_handle)` {#TaggedRunMetadata.ClearExtension}
+
+
+
+
+- - -
+
+#### `tf.summary.TaggedRunMetadata.ClearField(field_name)` {#TaggedRunMetadata.ClearField}
+
+
+
+
+- - -
+
+#### `tf.summary.TaggedRunMetadata.CopyFrom(other_msg)` {#TaggedRunMetadata.CopyFrom}
+
+Copies the content of the specified message into the current message.
+
+The method clears the current message and then merges the specified
+message using MergeFrom.
+
+##### Args:
+
+
+* <b>`other_msg`</b>: Message to copy into the current one.
+
+
+- - -
+
+#### `tf.summary.TaggedRunMetadata.DiscardUnknownFields()` {#TaggedRunMetadata.DiscardUnknownFields}
+
+
+
+
+- - -
+
+#### `tf.summary.TaggedRunMetadata.FindInitializationErrors()` {#TaggedRunMetadata.FindInitializationErrors}
+
+Finds required fields which are not initialized.
+
+##### Returns:
+
+ A list of strings. Each string is a path to an uninitialized field from
+ the top-level message, e.g. "foo.bar[5].baz".
+
+
+- - -
+
+#### `tf.summary.TaggedRunMetadata.FromString(s)` {#TaggedRunMetadata.FromString}
+
+
+
+
+- - -
+
+#### `tf.summary.TaggedRunMetadata.HasExtension(extension_handle)` {#TaggedRunMetadata.HasExtension}
+
+
+
+
+- - -
+
+#### `tf.summary.TaggedRunMetadata.HasField(field_name)` {#TaggedRunMetadata.HasField}
+
+
+
+
+- - -
+
+#### `tf.summary.TaggedRunMetadata.IsInitialized(errors=None)` {#TaggedRunMetadata.IsInitialized}
+
+Checks if all required fields of a message are set.
+
+##### Args:
+
+
+* <b>`errors`</b>: A list which, if provided, will be populated with the field
+ paths of all missing required fields.
+
+##### Returns:
+
+ True iff the specified message has all required fields set.
+
+
+- - -
+
+#### `tf.summary.TaggedRunMetadata.ListFields()` {#TaggedRunMetadata.ListFields}
+
+
+
+
+- - -
+
+#### `tf.summary.TaggedRunMetadata.MergeFrom(msg)` {#TaggedRunMetadata.MergeFrom}
+
+
+
+
+- - -
+
+#### `tf.summary.TaggedRunMetadata.MergeFromString(serialized)` {#TaggedRunMetadata.MergeFromString}
+
+
+
+
+- - -
+
+#### `tf.summary.TaggedRunMetadata.ParseFromString(serialized)` {#TaggedRunMetadata.ParseFromString}
+
+Parse serialized protocol buffer data into this message.
+
+Like MergeFromString(), except we clear the object first and
+do not return the value that MergeFromString returns.
+
+
+- - -
+
+#### `tf.summary.TaggedRunMetadata.RegisterExtension(extension_handle)` {#TaggedRunMetadata.RegisterExtension}
+
+
+
+
+- - -
+
+#### `tf.summary.TaggedRunMetadata.SerializePartialToString()` {#TaggedRunMetadata.SerializePartialToString}
+
+
+
+
+- - -
+
+#### `tf.summary.TaggedRunMetadata.SerializeToString()` {#TaggedRunMetadata.SerializeToString}
+
+
+
+
+- - -
+
+#### `tf.summary.TaggedRunMetadata.SetInParent()` {#TaggedRunMetadata.SetInParent}
+
+Sets the _cached_byte_size_dirty bit to true,
+and propagates this to our listener iff this was a state change.
+
+
+- - -
+
+#### `tf.summary.TaggedRunMetadata.WhichOneof(oneof_name)` {#TaggedRunMetadata.WhichOneof}
+
+Returns the name of the currently set field inside a oneof, or None.
+
+
+- - -
+
+#### `tf.summary.TaggedRunMetadata.__deepcopy__(memo=None)` {#TaggedRunMetadata.__deepcopy__}
+
+
+
+
+- - -
+
+#### `tf.summary.TaggedRunMetadata.__eq__(other)` {#TaggedRunMetadata.__eq__}
+
+
+
+
+- - -
+
#### `tf.summary.TaggedRunMetadata.__getstate__()` {#TaggedRunMetadata.__getstate__}
Support the pickle protocol.
+- - -
+
+#### `tf.summary.TaggedRunMetadata.__hash__()` {#TaggedRunMetadata.__hash__}
+
+
+
+
+- - -
+
+#### `tf.summary.TaggedRunMetadata.__init__(**kwargs)` {#TaggedRunMetadata.__init__}
+
+
+
+
+- - -
+
+#### `tf.summary.TaggedRunMetadata.__ne__(other_msg)` {#TaggedRunMetadata.__ne__}
+
+
+
+
+- - -
+
+#### `tf.summary.TaggedRunMetadata.__repr__()` {#TaggedRunMetadata.__repr__}
+
+
+
+
+- - -
+
+#### `tf.summary.TaggedRunMetadata.__setstate__(state)` {#TaggedRunMetadata.__setstate__}
+
+Support the pickle protocol.
+
+
+- - -
+
+#### `tf.summary.TaggedRunMetadata.__str__()` {#TaggedRunMetadata.__str__}
+
+
+
+
+- - -
+
+#### `tf.summary.TaggedRunMetadata.__unicode__()` {#TaggedRunMetadata.__unicode__}
+
+
+
+
+- - -
+
+#### `tf.summary.TaggedRunMetadata.run_metadata` {#TaggedRunMetadata.run_metadata}
+
+Magic attribute generated for "run_metadata" proto field.
+
+
+- - -
+
+#### `tf.summary.TaggedRunMetadata.tag` {#TaggedRunMetadata.tag}
+
+Magic attribute generated for "tag" proto field.
+
+
diff --git a/tensorflow/g3doc/api_docs/python/functions_and_classes/shard2/tf.constant.md b/tensorflow/g3doc/api_docs/python/functions_and_classes/shard2/tf.constant.md
index ff34b6eeb1..3cc1e1ac0a 100644
--- a/tensorflow/g3doc/api_docs/python/functions_and_classes/shard2/tf.constant.md
+++ b/tensorflow/g3doc/api_docs/python/functions_and_classes/shard2/tf.constant.md
@@ -1,4 +1,4 @@
-### `tf.constant(value, dtype=None, shape=None, name='Const')` {#constant}
+### `tf.constant(value, dtype=None, shape=None, name='Const', verify_shape=False)` {#constant}
Creates a constant tensor.
@@ -44,6 +44,9 @@ Creates a constant tensor.
* <b>`name`</b>: Optional name for the tensor.
+
+* <b>`verify_shape`</b>: Boolean that enables verification of the shape of values.
+
##### Returns:
A Constant Tensor.
diff --git a/tensorflow/g3doc/api_docs/python/functions_and_classes/shard2/tf.summary.SummaryDescription.md b/tensorflow/g3doc/api_docs/python/functions_and_classes/shard2/tf.summary.SummaryDescription.md
index 19532f7cc3..bce704ef4f 100644
--- a/tensorflow/g3doc/api_docs/python/functions_and_classes/shard2/tf.summary.SummaryDescription.md
+++ b/tensorflow/g3doc/api_docs/python/functions_and_classes/shard2/tf.summary.SummaryDescription.md
@@ -1,8 +1,245 @@
- - -
+#### `tf.summary.SummaryDescription.ByteSize()` {#SummaryDescription.ByteSize}
+
+
+
+
+- - -
+
+#### `tf.summary.SummaryDescription.Clear()` {#SummaryDescription.Clear}
+
+
+
+
+- - -
+
+#### `tf.summary.SummaryDescription.ClearExtension(extension_handle)` {#SummaryDescription.ClearExtension}
+
+
+
+
+- - -
+
+#### `tf.summary.SummaryDescription.ClearField(field_name)` {#SummaryDescription.ClearField}
+
+
+
+
+- - -
+
+#### `tf.summary.SummaryDescription.CopyFrom(other_msg)` {#SummaryDescription.CopyFrom}
+
+Copies the content of the specified message into the current message.
+
+The method clears the current message and then merges the specified
+message using MergeFrom.
+
+##### Args:
+
+
+* <b>`other_msg`</b>: Message to copy into the current one.
+
+
+- - -
+
+#### `tf.summary.SummaryDescription.DiscardUnknownFields()` {#SummaryDescription.DiscardUnknownFields}
+
+
+
+
+- - -
+
+#### `tf.summary.SummaryDescription.FindInitializationErrors()` {#SummaryDescription.FindInitializationErrors}
+
+Finds required fields which are not initialized.
+
+##### Returns:
+
+ A list of strings. Each string is a path to an uninitialized field from
+ the top-level message, e.g. "foo.bar[5].baz".
+
+
+- - -
+
+#### `tf.summary.SummaryDescription.FromString(s)` {#SummaryDescription.FromString}
+
+
+
+
+- - -
+
+#### `tf.summary.SummaryDescription.HasExtension(extension_handle)` {#SummaryDescription.HasExtension}
+
+
+
+
+- - -
+
+#### `tf.summary.SummaryDescription.HasField(field_name)` {#SummaryDescription.HasField}
+
+
+
+
+- - -
+
+#### `tf.summary.SummaryDescription.IsInitialized(errors=None)` {#SummaryDescription.IsInitialized}
+
+Checks if all required fields of a message are set.
+
+##### Args:
+
+
+* <b>`errors`</b>: A list which, if provided, will be populated with the field
+ paths of all missing required fields.
+
+##### Returns:
+
+ True iff the specified message has all required fields set.
+
+
+- - -
+
+#### `tf.summary.SummaryDescription.ListFields()` {#SummaryDescription.ListFields}
+
+
+
+
+- - -
+
+#### `tf.summary.SummaryDescription.MergeFrom(msg)` {#SummaryDescription.MergeFrom}
+
+
+
+
+- - -
+
+#### `tf.summary.SummaryDescription.MergeFromString(serialized)` {#SummaryDescription.MergeFromString}
+
+
+
+
+- - -
+
+#### `tf.summary.SummaryDescription.ParseFromString(serialized)` {#SummaryDescription.ParseFromString}
+
+Parse serialized protocol buffer data into this message.
+
+Like MergeFromString(), except we clear the object first and
+do not return the value that MergeFromString returns.
+
+
+- - -
+
+#### `tf.summary.SummaryDescription.RegisterExtension(extension_handle)` {#SummaryDescription.RegisterExtension}
+
+
+
+
+- - -
+
+#### `tf.summary.SummaryDescription.SerializePartialToString()` {#SummaryDescription.SerializePartialToString}
+
+
+
+
+- - -
+
+#### `tf.summary.SummaryDescription.SerializeToString()` {#SummaryDescription.SerializeToString}
+
+
+
+
+- - -
+
+#### `tf.summary.SummaryDescription.SetInParent()` {#SummaryDescription.SetInParent}
+
+Sets the _cached_byte_size_dirty bit to true,
+and propagates this to our listener iff this was a state change.
+
+
+- - -
+
+#### `tf.summary.SummaryDescription.WhichOneof(oneof_name)` {#SummaryDescription.WhichOneof}
+
+Returns the name of the currently set field inside a oneof, or None.
+
+
+- - -
+
+#### `tf.summary.SummaryDescription.__deepcopy__(memo=None)` {#SummaryDescription.__deepcopy__}
+
+
+
+
+- - -
+
+#### `tf.summary.SummaryDescription.__eq__(other)` {#SummaryDescription.__eq__}
+
+
+
+
+- - -
+
#### `tf.summary.SummaryDescription.__getstate__()` {#SummaryDescription.__getstate__}
Support the pickle protocol.
+- - -
+
+#### `tf.summary.SummaryDescription.__hash__()` {#SummaryDescription.__hash__}
+
+
+
+
+- - -
+
+#### `tf.summary.SummaryDescription.__init__(**kwargs)` {#SummaryDescription.__init__}
+
+
+
+
+- - -
+
+#### `tf.summary.SummaryDescription.__ne__(other_msg)` {#SummaryDescription.__ne__}
+
+
+
+
+- - -
+
+#### `tf.summary.SummaryDescription.__repr__()` {#SummaryDescription.__repr__}
+
+
+
+
+- - -
+
+#### `tf.summary.SummaryDescription.__setstate__(state)` {#SummaryDescription.__setstate__}
+
+Support the pickle protocol.
+
+
+- - -
+
+#### `tf.summary.SummaryDescription.__str__()` {#SummaryDescription.__str__}
+
+
+
+
+- - -
+
+#### `tf.summary.SummaryDescription.__unicode__()` {#SummaryDescription.__unicode__}
+
+
+
+
+- - -
+
+#### `tf.summary.SummaryDescription.type_hint` {#SummaryDescription.type_hint}
+
+Magic attribute generated for "type_hint" proto field.
+
+
diff --git a/tensorflow/g3doc/api_docs/python/functions_and_classes/shard2/tf.test.TestCase.md b/tensorflow/g3doc/api_docs/python/functions_and_classes/shard2/tf.test.TestCase.md
index ae38c9fe0a..598827ea70 100644
--- a/tensorflow/g3doc/api_docs/python/functions_and_classes/shard2/tf.test.TestCase.md
+++ b/tensorflow/g3doc/api_docs/python/functions_and_classes/shard2/tf.test.TestCase.md
@@ -175,125 +175,6 @@ Checks that for all elements of farray1 and farray2
- - -
-#### `tf.test.TestCase.assertBetween(value, minv, maxv, msg=None)` {#TestCase.assertBetween}
-
-Asserts that value is between minv and maxv (inclusive).
-
-
-- - -
-
-#### `tf.test.TestCase.assertCommandFails(command, regexes, env=None, close_fds=True, msg=None)` {#TestCase.assertCommandFails}
-
-Asserts a shell command fails and the error matches a regex in a list.
-
-##### Args:
-
-
-* <b>`command`</b>: List or string representing the command to run.
-* <b>`regexes`</b>: the list of regular expression strings.
-* <b>`env`</b>: Dictionary of environment variable settings.
-* <b>`close_fds`</b>: Whether or not to close all open fd's in the child after
- forking.
-* <b>`msg`</b>: Optional message to report on failure.
-
-
-- - -
-
-#### `tf.test.TestCase.assertCommandSucceeds(command, regexes=('',), env=None, close_fds=True, msg=None)` {#TestCase.assertCommandSucceeds}
-
-Asserts that a shell command succeeds (i.e. exits with code 0).
-
-##### Args:
-
-
-* <b>`command`</b>: List or string representing the command to run.
-* <b>`regexes`</b>: List of regular expression byte strings that match success.
-* <b>`env`</b>: Dictionary of environment variable settings.
-* <b>`close_fds`</b>: Whether or not to close all open fd's in the child after
- forking.
-* <b>`msg`</b>: Optional message to report on failure.
-
-
-- - -
-
-#### `tf.test.TestCase.assertContainsExactSubsequence(container, subsequence, msg=None)` {#TestCase.assertContainsExactSubsequence}
-
-Assert that "container" contains "subsequence" as an exact subsequence.
-
-Asserts that "container" contains all the elements of "subsequence", in
-order, and without other elements interspersed. For example, [1, 2, 3] is an
-exact subsequence of [0, 0, 1, 2, 3, 0] but not of [0, 0, 1, 2, 0, 3, 0].
-
-##### Args:
-
-
-* <b>`container`</b>: the list we're testing for subsequence inclusion.
-* <b>`subsequence`</b>: the list we hope will be an exact subsequence of container.
-* <b>`msg`</b>: Optional message to report on failure.
-
-
-- - -
-
-#### `tf.test.TestCase.assertContainsInOrder(strings, target, msg=None)` {#TestCase.assertContainsInOrder}
-
-Asserts that the strings provided are found in the target in order.
-
-This may be useful for checking HTML output.
-
-##### Args:
-
-
-* <b>`strings`</b>: A list of strings, such as [ 'fox', 'dog' ]
-* <b>`target`</b>: A target string in which to look for the strings, such as
- 'The quick brown fox jumped over the lazy dog'.
-* <b>`msg`</b>: Optional message to report on failure.
-
-
-- - -
-
-#### `tf.test.TestCase.assertContainsSubsequence(container, subsequence, msg=None)` {#TestCase.assertContainsSubsequence}
-
-Assert that "container" contains "subsequence" as a subsequence.
-
-Asserts that "container" contains all the elements of "subsequence", in
-order, but possibly with other elements interspersed. For example, [1, 2, 3]
-is a subsequence of [0, 0, 1, 2, 0, 3, 0] but not of [0, 0, 1, 3, 0, 2, 0].
-
-##### Args:
-
-
-* <b>`container`</b>: the list we're testing for subsequence inclusion.
-* <b>`subsequence`</b>: the list we hope will be a subsequence of container.
-* <b>`msg`</b>: Optional message to report on failure.
-
-
-- - -
-
-#### `tf.test.TestCase.assertContainsSubset(expected_subset, actual_set, msg=None)` {#TestCase.assertContainsSubset}
-
-Checks whether actual iterable is a superset of expected iterable.
-
-
-- - -
-
-#### `tf.test.TestCase.assertCountEqual(*args, **kwargs)` {#TestCase.assertCountEqual}
-
-An unordered sequence specific comparison.
-
-Equivalent to assertItemsEqual(). This method is a compatibility layer
-for Python 3k, since 2to3 does not convert assertItemsEqual() calls into
-assertCountEqual() calls.
-
-##### Args:
-
-
-* <b>`expected_seq`</b>: A sequence containing elements we are expecting.
-* <b>`actual_seq`</b>: The sequence that we are testing.
-* <b>`msg`</b>: The message to be printed if the test fails.
-
-
-- - -
-
#### `tf.test.TestCase.assertDeviceEqual(device1, device2)` {#TestCase.assertDeviceEqual}
Asserts that the two given devices are the same.
@@ -314,49 +195,10 @@ Checks whether actual is a superset of expected.
- - -
-#### `tf.test.TestCase.assertDictEqual(a, b, msg=None)` {#TestCase.assertDictEqual}
+#### `tf.test.TestCase.assertDictEqual(d1, d2, msg=None)` {#TestCase.assertDictEqual}
-Raises AssertionError if a and b are not equal dictionaries.
-
-##### Args:
-
-
-* <b>`a`</b>: A dict, the expected value.
-* <b>`b`</b>: A dict, the actual value.
-* <b>`msg`</b>: An optional str, the associated message.
-
-##### Raises:
-
-
-* <b>`AssertionError`</b>: if the dictionaries are not equal.
-
-
-- - -
-
-#### `tf.test.TestCase.assertEmpty(container, msg=None)` {#TestCase.assertEmpty}
-
-Assert that an object has zero length.
-
-##### Args:
-
-
-* <b>`container`</b>: Anything that implements the collections.Sized interface.
-* <b>`msg`</b>: Optional message to report on failure.
-- - -
-
-#### `tf.test.TestCase.assertEndsWith(actual, expected_end, msg=None)` {#TestCase.assertEndsWith}
-
-Assert that actual.endswith(expected_end) is True.
-
-##### Args:
-
-
-* <b>`actual`</b>: str
-* <b>`expected_end`</b>: str
-* <b>`msg`</b>: Optional message to report on failure.
-
- - -
@@ -440,11 +282,10 @@ Included for symmetry with assertIsNone.
- - -
-#### `tf.test.TestCase.assertItemsEqual(*args, **kwargs)` {#TestCase.assertItemsEqual}
-
-An unordered sequence specific comparison.
+#### `tf.test.TestCase.assertItemsEqual(expected_seq, actual_seq, msg=None)` {#TestCase.assertItemsEqual}
-It asserts that actual_seq and expected_seq have the same element counts.
+An unordered sequence specific comparison. It asserts that
+actual_seq and expected_seq have the same element counts.
Equivalent to::
self.assertEqual(Counter(iter(actual_seq)),
@@ -457,30 +298,6 @@ Asserts that each element has the same count in both sequences.
- [0, 1, 1] and [1, 0, 1] compare equal.
- [0, 0, 1] and [0, 1] compare unequal.
-##### Args:
-
-
-* <b>`expected_seq`</b>: A sequence containing elements we are expecting.
-* <b>`actual_seq`</b>: The sequence that we are testing.
-* <b>`msg`</b>: The message to be printed if the test fails.
-
-
-- - -
-
-#### `tf.test.TestCase.assertJsonEqual(first, second, msg=None)` {#TestCase.assertJsonEqual}
-
-Asserts that the JSON objects defined in two strings are equal.
-
-A summary of the differences will be included in the failure message
-using assertSameStructure.
-
-##### Args:
-
-
-* <b>`first`</b>: A string contining JSON to decode and compare to second.
-* <b>`second`</b>: A string contining JSON to decode and compare to first.
-* <b>`msg`</b>: Additional text to include in the failure message.
-
- - -
@@ -552,13 +369,6 @@ if not.
- - -
-#### `tf.test.TestCase.assertNoCommonElements(expected_seq, actual_seq, msg=None)` {#TestCase.assertNoCommonElements}
-
-Checks whether actual iterable and expected iterable are disjoint.
-
-
-- - -
-
#### `tf.test.TestCase.assertNotAlmostEqual(first, second, places=None, msg=None, delta=None)` {#TestCase.assertNotAlmostEqual}
Fail if the two objects are equal as determined by their
@@ -589,33 +399,6 @@ Objects that are equal automatically fail.
- - -
-#### `tf.test.TestCase.assertNotEmpty(container, msg=None)` {#TestCase.assertNotEmpty}
-
-Assert that an object has non-zero length.
-
-##### Args:
-
-
-* <b>`container`</b>: Anything that implements the collections.Sized interface.
-* <b>`msg`</b>: Optional message to report on failure.
-
-
-- - -
-
-#### `tf.test.TestCase.assertNotEndsWith(actual, unexpected_end, msg=None)` {#TestCase.assertNotEndsWith}
-
-Assert that actual.endswith(unexpected_end) is False.
-
-##### Args:
-
-
-* <b>`actual`</b>: str
-* <b>`unexpected_end`</b>: str
-* <b>`msg`</b>: Optional message to report on failure.
-
-
-- - -
-
#### `tf.test.TestCase.assertNotEqual(first, second, msg=None)` {#TestCase.assertNotEqual}
Fail if the two objects are equal as determined by the '!='
@@ -653,20 +436,6 @@ Fail the test if the text matches the regular expression.
- - -
-#### `tf.test.TestCase.assertNotStartsWith(actual, unexpected_start, msg=None)` {#TestCase.assertNotStartsWith}
-
-Assert that actual.startswith(unexpected_start) is False.
-
-##### Args:
-
-
-* <b>`actual`</b>: str
-* <b>`unexpected_start`</b>: str
-* <b>`msg`</b>: Optional message to report on failure.
-
-
-- - -
-
#### `tf.test.TestCase.assertProtoEquals(expected_message_maybe_ascii, message)` {#TestCase.assertProtoEquals}
Asserts that message is same as parsed expected_message_ascii.
@@ -741,38 +510,6 @@ Asserts that the message in a raised exception matches a regexp.
- - -
-#### `tf.test.TestCase.assertRaisesWithLiteralMatch(expected_exception, expected_exception_message, callable_obj=None, *args, **kwargs)` {#TestCase.assertRaisesWithLiteralMatch}
-
-Asserts that the message in a raised exception equals the given string.
-
-Unlike assertRaisesRegexp, this method takes a literal string, not
-a regular expression.
-
-with self.assertRaisesWithLiteralMatch(ExType, 'message'):
- DoSomething()
-
-##### Args:
-
-
-* <b>`expected_exception`</b>: Exception class expected to be raised.
-* <b>`expected_exception_message`</b>: String message expected in the raised
- exception. For a raise exception e, expected_exception_message must
- equal str(e).
-* <b>`callable_obj`</b>: Function to be called, or None to return a context.
-* <b>`args`</b>: Extra args.
-* <b>`kwargs`</b>: Extra kwargs.
-
-##### Returns:
-
- A context manager if callable_obj is None. Otherwise, None.
-
-##### Raises:
-
- self.failureException if callable_obj does not raise a macthing exception.
-
-
-- - -
-
#### `tf.test.TestCase.assertRaisesWithPredicateMatch(exception_type, expected_err_re_or_predicate)` {#TestCase.assertRaisesWithPredicateMatch}
Returns a context manager to enclose code expected to raise an exception.
@@ -797,71 +534,6 @@ predicate search.
- - -
-#### `tf.test.TestCase.assertRaisesWithRegexpMatch(expected_exception, expected_regexp, callable_obj=None, *args, **kwargs)` {#TestCase.assertRaisesWithRegexpMatch}
-
-Asserts that the message in a raised exception matches the given regexp.
-
-This is just a wrapper around assertRaisesRegexp. Please use
-assertRaisesRegexp instead of assertRaisesWithRegexpMatch.
-
-##### Args:
-
-
-* <b>`expected_exception`</b>: Exception class expected to be raised.
-* <b>`expected_regexp`</b>: Regexp (re pattern object or string) expected to be
- found in error message.
-* <b>`callable_obj`</b>: Function to be called, or None to return a context.
-* <b>`args`</b>: Extra args.
-* <b>`kwargs`</b>: Extra keyword args.
-
-##### Returns:
-
- A context manager if callable_obj is None. Otherwise, None.
-
-##### Raises:
-
- self.failureException if callable_obj does not raise a macthing exception.
-
-
-- - -
-
-#### `tf.test.TestCase.assertRegexMatch(actual_str, regexes, message=None)` {#TestCase.assertRegexMatch}
-
-Asserts that at least one regex in regexes matches str.
-
- If possible you should use assertRegexpMatches, which is a simpler
- version of this method. assertRegexpMatches takes a single regular
- expression (a string or re compiled object) instead of a list.
-
- Notes:
- 1. This function uses substring matching, i.e. the matching
- succeeds if *any* substring of the error message matches *any*
- regex in the list. This is more convenient for the user than
- full-string matching.
-
- 2. If regexes is the empty list, the matching will always fail.
-
- 3. Use regexes=[''] for a regex that will always pass.
-
- 4. '.' matches any single character *except* the newline. To
- match any character, use '(.|
-)'.
-
- 5. '^' matches the beginning of each line, not just the beginning
- of the string. Similarly, '$' matches the end of each line.
-
- 6. An exception will be thrown if regexes contains an invalid
- regex.
-
- Args:
- actual_str: The string we try to match with the items in regexes.
- regexes: The regular expressions we want to match against str.
- See "Notes" above for detailed notes on how this is interpreted.
- message: The message to be printed if the test fails.
-
-
-- - -
-
#### `tf.test.TestCase.assertRegexpMatches(text, expected_regexp, msg=None)` {#TestCase.assertRegexpMatches}
Fail the test unless the text matches the regular expression.
@@ -869,79 +541,6 @@ Fail the test unless the text matches the regular expression.
- - -
-#### `tf.test.TestCase.assertSameElements(expected_seq, actual_seq, msg=None)` {#TestCase.assertSameElements}
-
-Assert that two sequences have the same elements (in any order).
-
-This method, unlike assertItemsEqual, doesn't care about any
-duplicates in the expected and actual sequences.
-
- >> assertSameElements([1, 1, 1, 0, 0, 0], [0, 1])
- # Doesn't raise an AssertionError
-
-If possible, you should use assertItemsEqual instead of
-assertSameElements.
-
-##### Args:
-
-
-* <b>`expected_seq`</b>: A sequence containing elements we are expecting.
-* <b>`actual_seq`</b>: The sequence that we are testing.
-* <b>`msg`</b>: The message to be printed if the test fails.
-
-
-- - -
-
-#### `tf.test.TestCase.assertSameStructure(a, b, aname='a', bname='b', msg=None)` {#TestCase.assertSameStructure}
-
-Asserts that two values contain the same structural content.
-
-The two arguments should be data trees consisting of trees of dicts and
-lists. They will be deeply compared by walking into the contents of dicts
-and lists; other items will be compared using the == operator.
-If the two structures differ in content, the failure message will indicate
-the location within the structures where the first difference is found.
-This may be helpful when comparing large structures.
-
-##### Args:
-
-
-* <b>`a`</b>: The first structure to compare.
-* <b>`b`</b>: The second structure to compare.
-* <b>`aname`</b>: Variable name to use for the first structure in assertion messages.
-* <b>`bname`</b>: Variable name to use for the second structure.
-* <b>`msg`</b>: Additional text to include in the failure message.
-
-
-- - -
-
-#### `tf.test.TestCase.assertSequenceAlmostEqual(expected_seq, actual_seq, places=None, msg=None, delta=None)` {#TestCase.assertSequenceAlmostEqual}
-
-An approximate equality assertion for ordered sequences.
-
-Fail if the two sequences are unequal as determined by their value
-differences rounded to the given number of decimal places (default 7) and
-comparing to zero, or by comparing that the difference between each value
-in the two sequences is more than the given delta.
-
-Note that decimal places (from zero) are usually not the same as significant
-digits (measured from the most signficant digit).
-
-If the two sequences compare equal then they will automatically compare
-almost equal.
-
-##### Args:
-
-
-* <b>`expected_seq`</b>: A sequence containing elements we are expecting.
-* <b>`actual_seq`</b>: The sequence that we are testing.
-* <b>`places`</b>: The number of decimal places to compare.
-* <b>`msg`</b>: The message to be printed if the test fails.
-* <b>`delta`</b>: The OK difference between compared values.
-
-
-- - -
-
#### `tf.test.TestCase.assertSequenceEqual(seq1, seq2, msg=None, seq_type=None)` {#TestCase.assertSequenceEqual}
An equality assertion for ordered sequences (like lists and tuples).
@@ -962,26 +561,6 @@ which can be indexed, has a length, and has an equality operator.
- - -
-#### `tf.test.TestCase.assertSequenceStartsWith(prefix, whole, msg=None)` {#TestCase.assertSequenceStartsWith}
-
-An equality assertion for the beginning of ordered sequences.
-
-If prefix is an empty sequence, it will raise an error unless whole is also
-an empty sequence.
-
-If prefix is not a sequence, it will raise an error if the first element of
-whole does not match.
-
-##### Args:
-
-
-* <b>`prefix`</b>: A sequence expected at the beginning of the whole parameter.
-* <b>`whole`</b>: The sequence in which to look for prefix.
-* <b>`msg`</b>: Optional message to report on failure.
-
-
-- - -
-
#### `tf.test.TestCase.assertSetEqual(set1, set2, msg=None)` {#TestCase.assertSetEqual}
A set-specific equality assertion.
@@ -1033,51 +612,6 @@ Assert that actual.startswith(expected_start) is True.
- - -
-#### `tf.test.TestCase.assertTotallyOrdered(*groups, **kwargs)` {#TestCase.assertTotallyOrdered}
-
-Asserts that total ordering has been implemented correctly.
-
-For example, say you have a class A that compares only on its attribute x.
-Comparators other than __lt__ are omitted for brevity.
-
-class A(object):
- def __init__(self, x, y):
- self.x = x
- self.y = y
-
- def __hash__(self):
- return hash(self.x)
-
- def __lt__(self, other):
- try:
- return self.x < other.x
- except AttributeError:
- return NotImplemented
-
-assertTotallyOrdered will check that instances can be ordered correctly.
-For example,
-
-self.assertTotallyOrdered(
- [None], # None should come before everything else.
- [1], # Integers sort earlier.
- [A(1, 'a')],
- [A(2, 'b')], # 2 is after 1.
- [A(3, 'c'), A(3, 'd')], # The second argument is irrelevant.
- [A(4, 'z')],
- ['foo']) # Strings sort last.
-
-##### Args:
-
-
-* <b>`*groups`</b>: A list of groups of elements. Each group of elements is a list
- of objects that are equal. The elements in each group must be less than
- the elements in the group after it. For example, these groups are
- totally ordered: [None], [1], [2, 2], [3].
-* <b>`**kwargs`</b>: optional msg keyword argument can be passed.
-
-
-- - -
-
#### `tf.test.TestCase.assertTrue(expr, msg=None)` {#TestCase.assertTrue}
Check that the expression is true.
@@ -1100,13 +634,6 @@ A tuple-specific equality assertion.
- - -
-#### `tf.test.TestCase.assertUrlEqual(a, b, msg=None)` {#TestCase.assertUrlEqual}
-
-Asserts that urls are equal, ignoring ordering of query params.
-
-
-- - -
-
#### `tf.test.TestCase.assert_(expr, msg=None)` {#TestCase.assert_}
Check that the expression is true.
@@ -1166,9 +693,9 @@ tearDown.
- - -
-#### `tf.test.TestCase.fail(msg=None, prefix=None)` {#TestCase.fail}
+#### `tf.test.TestCase.fail(msg=None)` {#TestCase.fail}
-Fail immediately with the given message, optionally prefixed.
+Fail immediately, with the given message.
- - -
@@ -1222,13 +749,6 @@ Fail immediately with the given message, optionally prefixed.
- - -
-#### `tf.test.TestCase.getRecordedProperties()` {#TestCase.getRecordedProperties}
-
-Return any properties that the user has recorded.
-
-
-- - -
-
#### `tf.test.TestCase.get_temp_dir()` {#TestCase.get_temp_dir}
@@ -1243,20 +763,6 @@ Return any properties that the user has recorded.
- - -
-#### `tf.test.TestCase.recordProperty(property_name, property_value)` {#TestCase.recordProperty}
-
-Record an arbitrary property for later use.
-
-##### Args:
-
-
-* <b>`property_name`</b>: str, name of property to record; must be a valid XML
- attribute name
-* <b>`property_value`</b>: value of property; must be valid XML attribute value
-
-
-- - -
-
#### `tf.test.TestCase.run(result=None)` {#TestCase.run}
@@ -1280,18 +786,11 @@ Hook method for setting up class fixture before running tests in the class.
#### `tf.test.TestCase.shortDescription()` {#TestCase.shortDescription}
-Format both the test method name and the first line of its docstring.
-
-If no docstring is given, only returns the method name.
-
-This method overrides unittest.TestCase.shortDescription(), which
-only returns the first line of the docstring, obscuring the name
-of the test upon failure.
-
-##### Returns:
-
+Returns a one-line description of the test, or None if no
+description has been provided.
-* <b>`desc`</b>: A short description of a test method.
+The default implementation of this method returns the first line of
+the specified test method's docstring.
- - -
diff --git a/tensorflow/g3doc/api_docs/python/functions_and_classes/shard4/tf.contrib.layers.batch_norm.md b/tensorflow/g3doc/api_docs/python/functions_and_classes/shard4/tf.contrib.layers.batch_norm.md
index 3e23cd1982..504157c51f 100644
--- a/tensorflow/g3doc/api_docs/python/functions_and_classes/shard4/tf.contrib.layers.batch_norm.md
+++ b/tensorflow/g3doc/api_docs/python/functions_and_classes/shard4/tf.contrib.layers.batch_norm.md
@@ -28,7 +28,10 @@ can have speed penalty, specially in distributed settings.
`batch_size`. The normalization is over all but the last dimension if
`data_format` is `NHWC` and the second dimension if `data_format` is
`NCHW`.
-* <b>`decay`</b>: decay for the moving average.
+* <b>`decay`</b>: decay for the moving average. Reasonable values for `decay` are close
+ to 1.0, typically in the multiple-nines range: 0.999, 0.99, 0.9, etc. Lower
+ the `decay` value (try `decay`=0.9) if the model shows reasonably
+ good training performance but poor validation and/or test performance.
* <b>`center`</b>: If True, subtract `beta`. If False, `beta` is ignored.
* <b>`scale`</b>: If True, multiply by `gamma`. If False, `gamma` is
not used. When the next layer is linear (also e.g. `nn.relu`), this can be
diff --git a/tensorflow/g3doc/api_docs/python/functions_and_classes/shard4/tf.summary.SummaryDescription.RegisterExtension.md b/tensorflow/g3doc/api_docs/python/functions_and_classes/shard4/tf.summary.SummaryDescription.RegisterExtension.md
new file mode 100644
index 0000000000..3cfd7103d7
--- /dev/null
+++ b/tensorflow/g3doc/api_docs/python/functions_and_classes/shard4/tf.summary.SummaryDescription.RegisterExtension.md
@@ -0,0 +1,4 @@
+#### `tf.summary.SummaryDescription.RegisterExtension(extension_handle)` {#SummaryDescription.RegisterExtension}
+
+
+
diff --git a/tensorflow/g3doc/api_docs/python/functions_and_classes/shard5/tf.summary.SummaryDescription.FromString.md b/tensorflow/g3doc/api_docs/python/functions_and_classes/shard5/tf.summary.SummaryDescription.FromString.md
new file mode 100644
index 0000000000..24a3b3f10c
--- /dev/null
+++ b/tensorflow/g3doc/api_docs/python/functions_and_classes/shard5/tf.summary.SummaryDescription.FromString.md
@@ -0,0 +1,4 @@
+#### `tf.summary.SummaryDescription.FromString(s)` {#SummaryDescription.FromString}
+
+
+
diff --git a/tensorflow/g3doc/api_docs/python/functions_and_classes/shard5/tf.train.replica_device_setter.md b/tensorflow/g3doc/api_docs/python/functions_and_classes/shard5/tf.train.replica_device_setter.md
index c2ed3423cf..4009cc9b30 100644
--- a/tensorflow/g3doc/api_docs/python/functions_and_classes/shard5/tf.train.replica_device_setter.md
+++ b/tensorflow/g3doc/api_docs/python/functions_and_classes/shard5/tf.train.replica_device_setter.md
@@ -24,7 +24,7 @@ For example,
cluster_spec = {
"ps": ["ps0:2222", "ps1:2222"],
"worker": ["worker0:2222", "worker1:2222", "worker2:2222"]}
-with tf.device(tf.replica_device_setter(cluster=cluster_spec)):
+with tf.device(tf.train.replica_device_setter(cluster=cluster_spec)):
# Build your graph
v1 = tf.Variable(...) # assigned to /job:ps/task:0
v2 = tf.Variable(...) # assigned to /job:ps/task:1
diff --git a/tensorflow/g3doc/api_docs/python/functions_and_classes/shard6/tf.contrib.util.make_tensor_proto.md b/tensorflow/g3doc/api_docs/python/functions_and_classes/shard6/tf.contrib.util.make_tensor_proto.md
index f84a59be49..0f6470c317 100644
--- a/tensorflow/g3doc/api_docs/python/functions_and_classes/shard6/tf.contrib.util.make_tensor_proto.md
+++ b/tensorflow/g3doc/api_docs/python/functions_and_classes/shard6/tf.contrib.util.make_tensor_proto.md
@@ -1,4 +1,4 @@
-### `tf.contrib.util.make_tensor_proto(values, dtype=None, shape=None)` {#make_tensor_proto}
+### `tf.contrib.util.make_tensor_proto(values, dtype=None, shape=None, verify_shape=False)` {#make_tensor_proto}
Create a TensorProto.
@@ -8,6 +8,7 @@ Create a TensorProto.
* <b>`values`</b>: Values to put in the TensorProto.
* <b>`dtype`</b>: Optional tensor_pb2 DataType value.
* <b>`shape`</b>: List of integers representing the dimensions of tensor.
+* <b>`verify_shape`</b>: Boolean that enables verification of the shape of values.
##### Returns:
@@ -20,7 +21,8 @@ Create a TensorProto.
* <b>`TypeError`</b>: if unsupported types are provided.
-* <b>`ValueError`</b>: if arguments have inappropriate values.
+* <b>`ValueError`</b>: if arguments have inappropriate values or if verify_shape is
+ True and the shape of values does not equal the shape from the argument.
make_tensor_proto accepts "values" of a python scalar, a python list, a
numpy ndarray, or a numpy scalar.
diff --git a/tensorflow/g3doc/api_docs/python/functions_and_classes/shard6/tf.summary.TaggedRunMetadata.RegisterExtension.md b/tensorflow/g3doc/api_docs/python/functions_and_classes/shard6/tf.summary.TaggedRunMetadata.RegisterExtension.md
new file mode 100644
index 0000000000..f2d0c042d7
--- /dev/null
+++ b/tensorflow/g3doc/api_docs/python/functions_and_classes/shard6/tf.summary.TaggedRunMetadata.RegisterExtension.md
@@ -0,0 +1,4 @@
+#### `tf.summary.TaggedRunMetadata.RegisterExtension(extension_handle)` {#TaggedRunMetadata.RegisterExtension}
+
+
+
diff --git a/tensorflow/g3doc/api_docs/python/functions_and_classes/shard8/tf.GraphKeys.md b/tensorflow/g3doc/api_docs/python/functions_and_classes/shard8/tf.GraphKeys.md
index 74b46140d2..ff4f8f8f58 100644
--- a/tensorflow/g3doc/api_docs/python/functions_and_classes/shard8/tf.GraphKeys.md
+++ b/tensorflow/g3doc/api_docs/python/functions_and_classes/shard8/tf.GraphKeys.md
@@ -42,3 +42,14 @@ The following standard keys are defined:
* `WEIGHTS`: weights inside neural network layers
* `BIASES`: biases inside neural network layers
* `ACTIVATIONS`: activations of neural network layers
+- - -
+
+#### `tf.GraphKeys.VARIABLES` {#GraphKeys.VARIABLES}
+
+DEPRECATED FUNCTION
+
+THIS FUNCTION IS DEPRECATED. It will be removed after 2017-03-02.
+Instructions for updating:
+VARIABLES collection name is deprecated, please use GLOBAL_VARIABLES instead
+
+
diff --git a/tensorflow/g3doc/api_docs/python/functions_and_classes/shard9/tf.summary.TaggedRunMetadata.FromString.md b/tensorflow/g3doc/api_docs/python/functions_and_classes/shard9/tf.summary.TaggedRunMetadata.FromString.md
new file mode 100644
index 0000000000..613f4ebd73
--- /dev/null
+++ b/tensorflow/g3doc/api_docs/python/functions_and_classes/shard9/tf.summary.TaggedRunMetadata.FromString.md
@@ -0,0 +1,4 @@
+#### `tf.summary.TaggedRunMetadata.FromString(s)` {#TaggedRunMetadata.FromString}
+
+
+
diff --git a/tensorflow/g3doc/api_docs/python/summary.md b/tensorflow/g3doc/api_docs/python/summary.md
index 208153b3c2..f20f876ca3 100644
--- a/tensorflow/g3doc/api_docs/python/summary.md
+++ b/tensorflow/g3doc/api_docs/python/summary.md
@@ -487,11 +487,248 @@ metadata is stored in its NodeDef. This method retrieves the description.
- - -
+#### `tf.summary.SummaryDescription.ByteSize()` {#SummaryDescription.ByteSize}
+
+
+
+
+- - -
+
+#### `tf.summary.SummaryDescription.Clear()` {#SummaryDescription.Clear}
+
+
+
+
+- - -
+
+#### `tf.summary.SummaryDescription.ClearExtension(extension_handle)` {#SummaryDescription.ClearExtension}
+
+
+
+
+- - -
+
+#### `tf.summary.SummaryDescription.ClearField(field_name)` {#SummaryDescription.ClearField}
+
+
+
+
+- - -
+
+#### `tf.summary.SummaryDescription.CopyFrom(other_msg)` {#SummaryDescription.CopyFrom}
+
+Copies the content of the specified message into the current message.
+
+The method clears the current message and then merges the specified
+message using MergeFrom.
+
+##### Args:
+
+
+* <b>`other_msg`</b>: Message to copy into the current one.
+
+
+- - -
+
+#### `tf.summary.SummaryDescription.DiscardUnknownFields()` {#SummaryDescription.DiscardUnknownFields}
+
+
+
+
+- - -
+
+#### `tf.summary.SummaryDescription.FindInitializationErrors()` {#SummaryDescription.FindInitializationErrors}
+
+Finds required fields which are not initialized.
+
+##### Returns:
+
+ A list of strings. Each string is a path to an uninitialized field from
+ the top-level message, e.g. "foo.bar[5].baz".
+
+
+- - -
+
+#### `tf.summary.SummaryDescription.FromString(s)` {#SummaryDescription.FromString}
+
+
+
+
+- - -
+
+#### `tf.summary.SummaryDescription.HasExtension(extension_handle)` {#SummaryDescription.HasExtension}
+
+
+
+
+- - -
+
+#### `tf.summary.SummaryDescription.HasField(field_name)` {#SummaryDescription.HasField}
+
+
+
+
+- - -
+
+#### `tf.summary.SummaryDescription.IsInitialized(errors=None)` {#SummaryDescription.IsInitialized}
+
+Checks if all required fields of a message are set.
+
+##### Args:
+
+
+* <b>`errors`</b>: A list which, if provided, will be populated with the field
+ paths of all missing required fields.
+
+##### Returns:
+
+ True iff the specified message has all required fields set.
+
+
+- - -
+
+#### `tf.summary.SummaryDescription.ListFields()` {#SummaryDescription.ListFields}
+
+
+
+
+- - -
+
+#### `tf.summary.SummaryDescription.MergeFrom(msg)` {#SummaryDescription.MergeFrom}
+
+
+
+
+- - -
+
+#### `tf.summary.SummaryDescription.MergeFromString(serialized)` {#SummaryDescription.MergeFromString}
+
+
+
+
+- - -
+
+#### `tf.summary.SummaryDescription.ParseFromString(serialized)` {#SummaryDescription.ParseFromString}
+
+Parse serialized protocol buffer data into this message.
+
+Like MergeFromString(), except we clear the object first and
+do not return the value that MergeFromString returns.
+
+
+- - -
+
+#### `tf.summary.SummaryDescription.RegisterExtension(extension_handle)` {#SummaryDescription.RegisterExtension}
+
+
+
+
+- - -
+
+#### `tf.summary.SummaryDescription.SerializePartialToString()` {#SummaryDescription.SerializePartialToString}
+
+
+
+
+- - -
+
+#### `tf.summary.SummaryDescription.SerializeToString()` {#SummaryDescription.SerializeToString}
+
+
+
+
+- - -
+
+#### `tf.summary.SummaryDescription.SetInParent()` {#SummaryDescription.SetInParent}
+
+Sets the _cached_byte_size_dirty bit to true,
+and propagates this to our listener iff this was a state change.
+
+
+- - -
+
+#### `tf.summary.SummaryDescription.WhichOneof(oneof_name)` {#SummaryDescription.WhichOneof}
+
+Returns the name of the currently set field inside a oneof, or None.
+
+
+- - -
+
+#### `tf.summary.SummaryDescription.__deepcopy__(memo=None)` {#SummaryDescription.__deepcopy__}
+
+
+
+
+- - -
+
+#### `tf.summary.SummaryDescription.__eq__(other)` {#SummaryDescription.__eq__}
+
+
+
+
+- - -
+
#### `tf.summary.SummaryDescription.__getstate__()` {#SummaryDescription.__getstate__}
Support the pickle protocol.
+- - -
+
+#### `tf.summary.SummaryDescription.__hash__()` {#SummaryDescription.__hash__}
+
+
+
+
+- - -
+
+#### `tf.summary.SummaryDescription.__init__(**kwargs)` {#SummaryDescription.__init__}
+
+
+
+
+- - -
+
+#### `tf.summary.SummaryDescription.__ne__(other_msg)` {#SummaryDescription.__ne__}
+
+
+
+
+- - -
+
+#### `tf.summary.SummaryDescription.__repr__()` {#SummaryDescription.__repr__}
+
+
+
+
+- - -
+
+#### `tf.summary.SummaryDescription.__setstate__(state)` {#SummaryDescription.__setstate__}
+
+Support the pickle protocol.
+
+
+- - -
+
+#### `tf.summary.SummaryDescription.__str__()` {#SummaryDescription.__str__}
+
+
+
+
+- - -
+
+#### `tf.summary.SummaryDescription.__unicode__()` {#SummaryDescription.__unicode__}
+
+
+
+
+- - -
+
+#### `tf.summary.SummaryDescription.type_hint` {#SummaryDescription.type_hint}
+
+Magic attribute generated for "type_hint" proto field.
+
+
- - -
@@ -500,9 +737,253 @@ Support the pickle protocol.
- - -
+#### `tf.summary.TaggedRunMetadata.ByteSize()` {#TaggedRunMetadata.ByteSize}
+
+
+
+
+- - -
+
+#### `tf.summary.TaggedRunMetadata.Clear()` {#TaggedRunMetadata.Clear}
+
+
+
+
+- - -
+
+#### `tf.summary.TaggedRunMetadata.ClearExtension(extension_handle)` {#TaggedRunMetadata.ClearExtension}
+
+
+
+
+- - -
+
+#### `tf.summary.TaggedRunMetadata.ClearField(field_name)` {#TaggedRunMetadata.ClearField}
+
+
+
+
+- - -
+
+#### `tf.summary.TaggedRunMetadata.CopyFrom(other_msg)` {#TaggedRunMetadata.CopyFrom}
+
+Copies the content of the specified message into the current message.
+
+The method clears the current message and then merges the specified
+message using MergeFrom.
+
+##### Args:
+
+
+* <b>`other_msg`</b>: Message to copy into the current one.
+
+
+- - -
+
+#### `tf.summary.TaggedRunMetadata.DiscardUnknownFields()` {#TaggedRunMetadata.DiscardUnknownFields}
+
+
+
+
+- - -
+
+#### `tf.summary.TaggedRunMetadata.FindInitializationErrors()` {#TaggedRunMetadata.FindInitializationErrors}
+
+Finds required fields which are not initialized.
+
+##### Returns:
+
+ A list of strings. Each string is a path to an uninitialized field from
+ the top-level message, e.g. "foo.bar[5].baz".
+
+
+- - -
+
+#### `tf.summary.TaggedRunMetadata.FromString(s)` {#TaggedRunMetadata.FromString}
+
+
+
+
+- - -
+
+#### `tf.summary.TaggedRunMetadata.HasExtension(extension_handle)` {#TaggedRunMetadata.HasExtension}
+
+
+
+
+- - -
+
+#### `tf.summary.TaggedRunMetadata.HasField(field_name)` {#TaggedRunMetadata.HasField}
+
+
+
+
+- - -
+
+#### `tf.summary.TaggedRunMetadata.IsInitialized(errors=None)` {#TaggedRunMetadata.IsInitialized}
+
+Checks if all required fields of a message are set.
+
+##### Args:
+
+
+* <b>`errors`</b>: A list which, if provided, will be populated with the field
+ paths of all missing required fields.
+
+##### Returns:
+
+ True iff the specified message has all required fields set.
+
+
+- - -
+
+#### `tf.summary.TaggedRunMetadata.ListFields()` {#TaggedRunMetadata.ListFields}
+
+
+
+
+- - -
+
+#### `tf.summary.TaggedRunMetadata.MergeFrom(msg)` {#TaggedRunMetadata.MergeFrom}
+
+
+
+
+- - -
+
+#### `tf.summary.TaggedRunMetadata.MergeFromString(serialized)` {#TaggedRunMetadata.MergeFromString}
+
+
+
+
+- - -
+
+#### `tf.summary.TaggedRunMetadata.ParseFromString(serialized)` {#TaggedRunMetadata.ParseFromString}
+
+Parse serialized protocol buffer data into this message.
+
+Like MergeFromString(), except we clear the object first and
+do not return the value that MergeFromString returns.
+
+
+- - -
+
+#### `tf.summary.TaggedRunMetadata.RegisterExtension(extension_handle)` {#TaggedRunMetadata.RegisterExtension}
+
+
+
+
+- - -
+
+#### `tf.summary.TaggedRunMetadata.SerializePartialToString()` {#TaggedRunMetadata.SerializePartialToString}
+
+
+
+
+- - -
+
+#### `tf.summary.TaggedRunMetadata.SerializeToString()` {#TaggedRunMetadata.SerializeToString}
+
+
+
+
+- - -
+
+#### `tf.summary.TaggedRunMetadata.SetInParent()` {#TaggedRunMetadata.SetInParent}
+
+Sets the _cached_byte_size_dirty bit to true,
+and propagates this to our listener iff this was a state change.
+
+
+- - -
+
+#### `tf.summary.TaggedRunMetadata.WhichOneof(oneof_name)` {#TaggedRunMetadata.WhichOneof}
+
+Returns the name of the currently set field inside a oneof, or None.
+
+
+- - -
+
+#### `tf.summary.TaggedRunMetadata.__deepcopy__(memo=None)` {#TaggedRunMetadata.__deepcopy__}
+
+
+
+
+- - -
+
+#### `tf.summary.TaggedRunMetadata.__eq__(other)` {#TaggedRunMetadata.__eq__}
+
+
+
+
+- - -
+
#### `tf.summary.TaggedRunMetadata.__getstate__()` {#TaggedRunMetadata.__getstate__}
Support the pickle protocol.
+- - -
+
+#### `tf.summary.TaggedRunMetadata.__hash__()` {#TaggedRunMetadata.__hash__}
+
+
+
+
+- - -
+
+#### `tf.summary.TaggedRunMetadata.__init__(**kwargs)` {#TaggedRunMetadata.__init__}
+
+
+
+
+- - -
+
+#### `tf.summary.TaggedRunMetadata.__ne__(other_msg)` {#TaggedRunMetadata.__ne__}
+
+
+
+
+- - -
+
+#### `tf.summary.TaggedRunMetadata.__repr__()` {#TaggedRunMetadata.__repr__}
+
+
+
+
+- - -
+
+#### `tf.summary.TaggedRunMetadata.__setstate__(state)` {#TaggedRunMetadata.__setstate__}
+
+Support the pickle protocol.
+
+
+- - -
+
+#### `tf.summary.TaggedRunMetadata.__str__()` {#TaggedRunMetadata.__str__}
+
+
+
+
+- - -
+
+#### `tf.summary.TaggedRunMetadata.__unicode__()` {#TaggedRunMetadata.__unicode__}
+
+
+
+
+- - -
+
+#### `tf.summary.TaggedRunMetadata.run_metadata` {#TaggedRunMetadata.run_metadata}
+
+Magic attribute generated for "run_metadata" proto field.
+
+
+- - -
+
+#### `tf.summary.TaggedRunMetadata.tag` {#TaggedRunMetadata.tag}
+
+Magic attribute generated for "tag" proto field.
+
+
diff --git a/tensorflow/g3doc/api_docs/python/test.md b/tensorflow/g3doc/api_docs/python/test.md
index 70d30b3fab..3b2dcd48b0 100644
--- a/tensorflow/g3doc/api_docs/python/test.md
+++ b/tensorflow/g3doc/api_docs/python/test.md
@@ -215,125 +215,6 @@ Checks that for all elements of farray1 and farray2
- - -
-#### `tf.test.TestCase.assertBetween(value, minv, maxv, msg=None)` {#TestCase.assertBetween}
-
-Asserts that value is between minv and maxv (inclusive).
-
-
-- - -
-
-#### `tf.test.TestCase.assertCommandFails(command, regexes, env=None, close_fds=True, msg=None)` {#TestCase.assertCommandFails}
-
-Asserts a shell command fails and the error matches a regex in a list.
-
-##### Args:
-
-
-* <b>`command`</b>: List or string representing the command to run.
-* <b>`regexes`</b>: the list of regular expression strings.
-* <b>`env`</b>: Dictionary of environment variable settings.
-* <b>`close_fds`</b>: Whether or not to close all open fd's in the child after
- forking.
-* <b>`msg`</b>: Optional message to report on failure.
-
-
-- - -
-
-#### `tf.test.TestCase.assertCommandSucceeds(command, regexes=('',), env=None, close_fds=True, msg=None)` {#TestCase.assertCommandSucceeds}
-
-Asserts that a shell command succeeds (i.e. exits with code 0).
-
-##### Args:
-
-
-* <b>`command`</b>: List or string representing the command to run.
-* <b>`regexes`</b>: List of regular expression byte strings that match success.
-* <b>`env`</b>: Dictionary of environment variable settings.
-* <b>`close_fds`</b>: Whether or not to close all open fd's in the child after
- forking.
-* <b>`msg`</b>: Optional message to report on failure.
-
-
-- - -
-
-#### `tf.test.TestCase.assertContainsExactSubsequence(container, subsequence, msg=None)` {#TestCase.assertContainsExactSubsequence}
-
-Assert that "container" contains "subsequence" as an exact subsequence.
-
-Asserts that "container" contains all the elements of "subsequence", in
-order, and without other elements interspersed. For example, [1, 2, 3] is an
-exact subsequence of [0, 0, 1, 2, 3, 0] but not of [0, 0, 1, 2, 0, 3, 0].
-
-##### Args:
-
-
-* <b>`container`</b>: the list we're testing for subsequence inclusion.
-* <b>`subsequence`</b>: the list we hope will be an exact subsequence of container.
-* <b>`msg`</b>: Optional message to report on failure.
-
-
-- - -
-
-#### `tf.test.TestCase.assertContainsInOrder(strings, target, msg=None)` {#TestCase.assertContainsInOrder}
-
-Asserts that the strings provided are found in the target in order.
-
-This may be useful for checking HTML output.
-
-##### Args:
-
-
-* <b>`strings`</b>: A list of strings, such as [ 'fox', 'dog' ]
-* <b>`target`</b>: A target string in which to look for the strings, such as
- 'The quick brown fox jumped over the lazy dog'.
-* <b>`msg`</b>: Optional message to report on failure.
-
-
-- - -
-
-#### `tf.test.TestCase.assertContainsSubsequence(container, subsequence, msg=None)` {#TestCase.assertContainsSubsequence}
-
-Assert that "container" contains "subsequence" as a subsequence.
-
-Asserts that "container" contains all the elements of "subsequence", in
-order, but possibly with other elements interspersed. For example, [1, 2, 3]
-is a subsequence of [0, 0, 1, 2, 0, 3, 0] but not of [0, 0, 1, 3, 0, 2, 0].
-
-##### Args:
-
-
-* <b>`container`</b>: the list we're testing for subsequence inclusion.
-* <b>`subsequence`</b>: the list we hope will be a subsequence of container.
-* <b>`msg`</b>: Optional message to report on failure.
-
-
-- - -
-
-#### `tf.test.TestCase.assertContainsSubset(expected_subset, actual_set, msg=None)` {#TestCase.assertContainsSubset}
-
-Checks whether actual iterable is a superset of expected iterable.
-
-
-- - -
-
-#### `tf.test.TestCase.assertCountEqual(*args, **kwargs)` {#TestCase.assertCountEqual}
-
-An unordered sequence specific comparison.
-
-Equivalent to assertItemsEqual(). This method is a compatibility layer
-for Python 3k, since 2to3 does not convert assertItemsEqual() calls into
-assertCountEqual() calls.
-
-##### Args:
-
-
-* <b>`expected_seq`</b>: A sequence containing elements we are expecting.
-* <b>`actual_seq`</b>: The sequence that we are testing.
-* <b>`msg`</b>: The message to be printed if the test fails.
-
-
-- - -
-
#### `tf.test.TestCase.assertDeviceEqual(device1, device2)` {#TestCase.assertDeviceEqual}
Asserts that the two given devices are the same.
@@ -354,49 +235,10 @@ Checks whether actual is a superset of expected.
- - -
-#### `tf.test.TestCase.assertDictEqual(a, b, msg=None)` {#TestCase.assertDictEqual}
+#### `tf.test.TestCase.assertDictEqual(d1, d2, msg=None)` {#TestCase.assertDictEqual}
-Raises AssertionError if a and b are not equal dictionaries.
-
-##### Args:
-
-
-* <b>`a`</b>: A dict, the expected value.
-* <b>`b`</b>: A dict, the actual value.
-* <b>`msg`</b>: An optional str, the associated message.
-
-##### Raises:
-
-
-* <b>`AssertionError`</b>: if the dictionaries are not equal.
-
-
-- - -
-
-#### `tf.test.TestCase.assertEmpty(container, msg=None)` {#TestCase.assertEmpty}
-
-Assert that an object has zero length.
-
-##### Args:
-
-
-* <b>`container`</b>: Anything that implements the collections.Sized interface.
-* <b>`msg`</b>: Optional message to report on failure.
-- - -
-
-#### `tf.test.TestCase.assertEndsWith(actual, expected_end, msg=None)` {#TestCase.assertEndsWith}
-
-Assert that actual.endswith(expected_end) is True.
-
-##### Args:
-
-
-* <b>`actual`</b>: str
-* <b>`expected_end`</b>: str
-* <b>`msg`</b>: Optional message to report on failure.
-
- - -
@@ -480,11 +322,10 @@ Included for symmetry with assertIsNone.
- - -
-#### `tf.test.TestCase.assertItemsEqual(*args, **kwargs)` {#TestCase.assertItemsEqual}
-
-An unordered sequence specific comparison.
+#### `tf.test.TestCase.assertItemsEqual(expected_seq, actual_seq, msg=None)` {#TestCase.assertItemsEqual}
-It asserts that actual_seq and expected_seq have the same element counts.
+An unordered sequence specific comparison. It asserts that
+actual_seq and expected_seq have the same element counts.
Equivalent to::
self.assertEqual(Counter(iter(actual_seq)),
@@ -497,30 +338,6 @@ Asserts that each element has the same count in both sequences.
- [0, 1, 1] and [1, 0, 1] compare equal.
- [0, 0, 1] and [0, 1] compare unequal.
-##### Args:
-
-
-* <b>`expected_seq`</b>: A sequence containing elements we are expecting.
-* <b>`actual_seq`</b>: The sequence that we are testing.
-* <b>`msg`</b>: The message to be printed if the test fails.
-
-
-- - -
-
-#### `tf.test.TestCase.assertJsonEqual(first, second, msg=None)` {#TestCase.assertJsonEqual}
-
-Asserts that the JSON objects defined in two strings are equal.
-
-A summary of the differences will be included in the failure message
-using assertSameStructure.
-
-##### Args:
-
-
-* <b>`first`</b>: A string contining JSON to decode and compare to second.
-* <b>`second`</b>: A string contining JSON to decode and compare to first.
-* <b>`msg`</b>: Additional text to include in the failure message.
-
- - -
@@ -592,13 +409,6 @@ if not.
- - -
-#### `tf.test.TestCase.assertNoCommonElements(expected_seq, actual_seq, msg=None)` {#TestCase.assertNoCommonElements}
-
-Checks whether actual iterable and expected iterable are disjoint.
-
-
-- - -
-
#### `tf.test.TestCase.assertNotAlmostEqual(first, second, places=None, msg=None, delta=None)` {#TestCase.assertNotAlmostEqual}
Fail if the two objects are equal as determined by their
@@ -629,33 +439,6 @@ Objects that are equal automatically fail.
- - -
-#### `tf.test.TestCase.assertNotEmpty(container, msg=None)` {#TestCase.assertNotEmpty}
-
-Assert that an object has non-zero length.
-
-##### Args:
-
-
-* <b>`container`</b>: Anything that implements the collections.Sized interface.
-* <b>`msg`</b>: Optional message to report on failure.
-
-
-- - -
-
-#### `tf.test.TestCase.assertNotEndsWith(actual, unexpected_end, msg=None)` {#TestCase.assertNotEndsWith}
-
-Assert that actual.endswith(unexpected_end) is False.
-
-##### Args:
-
-
-* <b>`actual`</b>: str
-* <b>`unexpected_end`</b>: str
-* <b>`msg`</b>: Optional message to report on failure.
-
-
-- - -
-
#### `tf.test.TestCase.assertNotEqual(first, second, msg=None)` {#TestCase.assertNotEqual}
Fail if the two objects are equal as determined by the '!='
@@ -693,20 +476,6 @@ Fail the test if the text matches the regular expression.
- - -
-#### `tf.test.TestCase.assertNotStartsWith(actual, unexpected_start, msg=None)` {#TestCase.assertNotStartsWith}
-
-Assert that actual.startswith(unexpected_start) is False.
-
-##### Args:
-
-
-* <b>`actual`</b>: str
-* <b>`unexpected_start`</b>: str
-* <b>`msg`</b>: Optional message to report on failure.
-
-
-- - -
-
#### `tf.test.TestCase.assertProtoEquals(expected_message_maybe_ascii, message)` {#TestCase.assertProtoEquals}
Asserts that message is same as parsed expected_message_ascii.
@@ -781,38 +550,6 @@ Asserts that the message in a raised exception matches a regexp.
- - -
-#### `tf.test.TestCase.assertRaisesWithLiteralMatch(expected_exception, expected_exception_message, callable_obj=None, *args, **kwargs)` {#TestCase.assertRaisesWithLiteralMatch}
-
-Asserts that the message in a raised exception equals the given string.
-
-Unlike assertRaisesRegexp, this method takes a literal string, not
-a regular expression.
-
-with self.assertRaisesWithLiteralMatch(ExType, 'message'):
- DoSomething()
-
-##### Args:
-
-
-* <b>`expected_exception`</b>: Exception class expected to be raised.
-* <b>`expected_exception_message`</b>: String message expected in the raised
- exception. For a raise exception e, expected_exception_message must
- equal str(e).
-* <b>`callable_obj`</b>: Function to be called, or None to return a context.
-* <b>`args`</b>: Extra args.
-* <b>`kwargs`</b>: Extra kwargs.
-
-##### Returns:
-
- A context manager if callable_obj is None. Otherwise, None.
-
-##### Raises:
-
- self.failureException if callable_obj does not raise a macthing exception.
-
-
-- - -
-
#### `tf.test.TestCase.assertRaisesWithPredicateMatch(exception_type, expected_err_re_or_predicate)` {#TestCase.assertRaisesWithPredicateMatch}
Returns a context manager to enclose code expected to raise an exception.
@@ -837,71 +574,6 @@ predicate search.
- - -
-#### `tf.test.TestCase.assertRaisesWithRegexpMatch(expected_exception, expected_regexp, callable_obj=None, *args, **kwargs)` {#TestCase.assertRaisesWithRegexpMatch}
-
-Asserts that the message in a raised exception matches the given regexp.
-
-This is just a wrapper around assertRaisesRegexp. Please use
-assertRaisesRegexp instead of assertRaisesWithRegexpMatch.
-
-##### Args:
-
-
-* <b>`expected_exception`</b>: Exception class expected to be raised.
-* <b>`expected_regexp`</b>: Regexp (re pattern object or string) expected to be
- found in error message.
-* <b>`callable_obj`</b>: Function to be called, or None to return a context.
-* <b>`args`</b>: Extra args.
-* <b>`kwargs`</b>: Extra keyword args.
-
-##### Returns:
-
- A context manager if callable_obj is None. Otherwise, None.
-
-##### Raises:
-
- self.failureException if callable_obj does not raise a macthing exception.
-
-
-- - -
-
-#### `tf.test.TestCase.assertRegexMatch(actual_str, regexes, message=None)` {#TestCase.assertRegexMatch}
-
-Asserts that at least one regex in regexes matches str.
-
- If possible you should use assertRegexpMatches, which is a simpler
- version of this method. assertRegexpMatches takes a single regular
- expression (a string or re compiled object) instead of a list.
-
- Notes:
- 1. This function uses substring matching, i.e. the matching
- succeeds if *any* substring of the error message matches *any*
- regex in the list. This is more convenient for the user than
- full-string matching.
-
- 2. If regexes is the empty list, the matching will always fail.
-
- 3. Use regexes=[''] for a regex that will always pass.
-
- 4. '.' matches any single character *except* the newline. To
- match any character, use '(.|
-)'.
-
- 5. '^' matches the beginning of each line, not just the beginning
- of the string. Similarly, '$' matches the end of each line.
-
- 6. An exception will be thrown if regexes contains an invalid
- regex.
-
- Args:
- actual_str: The string we try to match with the items in regexes.
- regexes: The regular expressions we want to match against str.
- See "Notes" above for detailed notes on how this is interpreted.
- message: The message to be printed if the test fails.
-
-
-- - -
-
#### `tf.test.TestCase.assertRegexpMatches(text, expected_regexp, msg=None)` {#TestCase.assertRegexpMatches}
Fail the test unless the text matches the regular expression.
@@ -909,79 +581,6 @@ Fail the test unless the text matches the regular expression.
- - -
-#### `tf.test.TestCase.assertSameElements(expected_seq, actual_seq, msg=None)` {#TestCase.assertSameElements}
-
-Assert that two sequences have the same elements (in any order).
-
-This method, unlike assertItemsEqual, doesn't care about any
-duplicates in the expected and actual sequences.
-
- >> assertSameElements([1, 1, 1, 0, 0, 0], [0, 1])
- # Doesn't raise an AssertionError
-
-If possible, you should use assertItemsEqual instead of
-assertSameElements.
-
-##### Args:
-
-
-* <b>`expected_seq`</b>: A sequence containing elements we are expecting.
-* <b>`actual_seq`</b>: The sequence that we are testing.
-* <b>`msg`</b>: The message to be printed if the test fails.
-
-
-- - -
-
-#### `tf.test.TestCase.assertSameStructure(a, b, aname='a', bname='b', msg=None)` {#TestCase.assertSameStructure}
-
-Asserts that two values contain the same structural content.
-
-The two arguments should be data trees consisting of trees of dicts and
-lists. They will be deeply compared by walking into the contents of dicts
-and lists; other items will be compared using the == operator.
-If the two structures differ in content, the failure message will indicate
-the location within the structures where the first difference is found.
-This may be helpful when comparing large structures.
-
-##### Args:
-
-
-* <b>`a`</b>: The first structure to compare.
-* <b>`b`</b>: The second structure to compare.
-* <b>`aname`</b>: Variable name to use for the first structure in assertion messages.
-* <b>`bname`</b>: Variable name to use for the second structure.
-* <b>`msg`</b>: Additional text to include in the failure message.
-
-
-- - -
-
-#### `tf.test.TestCase.assertSequenceAlmostEqual(expected_seq, actual_seq, places=None, msg=None, delta=None)` {#TestCase.assertSequenceAlmostEqual}
-
-An approximate equality assertion for ordered sequences.
-
-Fail if the two sequences are unequal as determined by their value
-differences rounded to the given number of decimal places (default 7) and
-comparing to zero, or by comparing that the difference between each value
-in the two sequences is more than the given delta.
-
-Note that decimal places (from zero) are usually not the same as significant
-digits (measured from the most signficant digit).
-
-If the two sequences compare equal then they will automatically compare
-almost equal.
-
-##### Args:
-
-
-* <b>`expected_seq`</b>: A sequence containing elements we are expecting.
-* <b>`actual_seq`</b>: The sequence that we are testing.
-* <b>`places`</b>: The number of decimal places to compare.
-* <b>`msg`</b>: The message to be printed if the test fails.
-* <b>`delta`</b>: The OK difference between compared values.
-
-
-- - -
-
#### `tf.test.TestCase.assertSequenceEqual(seq1, seq2, msg=None, seq_type=None)` {#TestCase.assertSequenceEqual}
An equality assertion for ordered sequences (like lists and tuples).
@@ -1002,26 +601,6 @@ which can be indexed, has a length, and has an equality operator.
- - -
-#### `tf.test.TestCase.assertSequenceStartsWith(prefix, whole, msg=None)` {#TestCase.assertSequenceStartsWith}
-
-An equality assertion for the beginning of ordered sequences.
-
-If prefix is an empty sequence, it will raise an error unless whole is also
-an empty sequence.
-
-If prefix is not a sequence, it will raise an error if the first element of
-whole does not match.
-
-##### Args:
-
-
-* <b>`prefix`</b>: A sequence expected at the beginning of the whole parameter.
-* <b>`whole`</b>: The sequence in which to look for prefix.
-* <b>`msg`</b>: Optional message to report on failure.
-
-
-- - -
-
#### `tf.test.TestCase.assertSetEqual(set1, set2, msg=None)` {#TestCase.assertSetEqual}
A set-specific equality assertion.
@@ -1073,51 +652,6 @@ Assert that actual.startswith(expected_start) is True.
- - -
-#### `tf.test.TestCase.assertTotallyOrdered(*groups, **kwargs)` {#TestCase.assertTotallyOrdered}
-
-Asserts that total ordering has been implemented correctly.
-
-For example, say you have a class A that compares only on its attribute x.
-Comparators other than __lt__ are omitted for brevity.
-
-class A(object):
- def __init__(self, x, y):
- self.x = x
- self.y = y
-
- def __hash__(self):
- return hash(self.x)
-
- def __lt__(self, other):
- try:
- return self.x < other.x
- except AttributeError:
- return NotImplemented
-
-assertTotallyOrdered will check that instances can be ordered correctly.
-For example,
-
-self.assertTotallyOrdered(
- [None], # None should come before everything else.
- [1], # Integers sort earlier.
- [A(1, 'a')],
- [A(2, 'b')], # 2 is after 1.
- [A(3, 'c'), A(3, 'd')], # The second argument is irrelevant.
- [A(4, 'z')],
- ['foo']) # Strings sort last.
-
-##### Args:
-
-
-* <b>`*groups`</b>: A list of groups of elements. Each group of elements is a list
- of objects that are equal. The elements in each group must be less than
- the elements in the group after it. For example, these groups are
- totally ordered: [None], [1], [2, 2], [3].
-* <b>`**kwargs`</b>: optional msg keyword argument can be passed.
-
-
-- - -
-
#### `tf.test.TestCase.assertTrue(expr, msg=None)` {#TestCase.assertTrue}
Check that the expression is true.
@@ -1140,13 +674,6 @@ A tuple-specific equality assertion.
- - -
-#### `tf.test.TestCase.assertUrlEqual(a, b, msg=None)` {#TestCase.assertUrlEqual}
-
-Asserts that urls are equal, ignoring ordering of query params.
-
-
-- - -
-
#### `tf.test.TestCase.assert_(expr, msg=None)` {#TestCase.assert_}
Check that the expression is true.
@@ -1206,9 +733,9 @@ tearDown.
- - -
-#### `tf.test.TestCase.fail(msg=None, prefix=None)` {#TestCase.fail}
+#### `tf.test.TestCase.fail(msg=None)` {#TestCase.fail}
-Fail immediately with the given message, optionally prefixed.
+Fail immediately, with the given message.
- - -
@@ -1262,13 +789,6 @@ Fail immediately with the given message, optionally prefixed.
- - -
-#### `tf.test.TestCase.getRecordedProperties()` {#TestCase.getRecordedProperties}
-
-Return any properties that the user has recorded.
-
-
-- - -
-
#### `tf.test.TestCase.get_temp_dir()` {#TestCase.get_temp_dir}
@@ -1283,20 +803,6 @@ Return any properties that the user has recorded.
- - -
-#### `tf.test.TestCase.recordProperty(property_name, property_value)` {#TestCase.recordProperty}
-
-Record an arbitrary property for later use.
-
-##### Args:
-
-
-* <b>`property_name`</b>: str, name of property to record; must be a valid XML
- attribute name
-* <b>`property_value`</b>: value of property; must be valid XML attribute value
-
-
-- - -
-
#### `tf.test.TestCase.run(result=None)` {#TestCase.run}
@@ -1320,18 +826,11 @@ Hook method for setting up class fixture before running tests in the class.
#### `tf.test.TestCase.shortDescription()` {#TestCase.shortDescription}
-Format both the test method name and the first line of its docstring.
-
-If no docstring is given, only returns the method name.
-
-This method overrides unittest.TestCase.shortDescription(), which
-only returns the first line of the docstring, obscuring the name
-of the test upon failure.
-
-##### Returns:
-
+Returns a one-line description of the test, or None if no
+description has been provided.
-* <b>`desc`</b>: A short description of a test method.
+The default implementation of this method returns the first line of
+the specified test method's docstring.
- - -
diff --git a/tensorflow/g3doc/api_docs/python/train.md b/tensorflow/g3doc/api_docs/python/train.md
index 680db85bc1..429bbf4d15 100644
--- a/tensorflow/g3doc/api_docs/python/train.md
+++ b/tensorflow/g3doc/api_docs/python/train.md
@@ -3472,7 +3472,7 @@ For example,
cluster_spec = {
"ps": ["ps0:2222", "ps1:2222"],
"worker": ["worker0:2222", "worker1:2222", "worker2:2222"]}
-with tf.device(tf.replica_device_setter(cluster=cluster_spec)):
+with tf.device(tf.train.replica_device_setter(cluster=cluster_spec)):
# Build your graph
v1 = tf.Variable(...) # assigned to /job:ps/task:0
v2 = tf.Variable(...) # assigned to /job:ps/task:1
diff --git a/tensorflow/g3doc/get_started/index.md b/tensorflow/g3doc/get_started/index.md
index 61ab03b37b..1642a87ece 100644
--- a/tensorflow/g3doc/get_started/index.md
+++ b/tensorflow/g3doc/get_started/index.md
@@ -43,6 +43,9 @@ for step in range(201):
print(step, sess.run(W), sess.run(b))
# Learns best fit is W: [0.1], b: [0.3]
+
+# Close the Session when we're done.
+sess.close()
```
The first part of this code builds the data flow graph. TensorFlow does not
diff --git a/tensorflow/g3doc/get_started/os_setup.md b/tensorflow/g3doc/get_started/os_setup.md
index 0fd27169c1..eb7503dbb6 100644
--- a/tensorflow/g3doc/get_started/os_setup.md
+++ b/tensorflow/g3doc/get_started/os_setup.md
@@ -38,11 +38,13 @@ Docker images are listed in the corresponding installation sections.
If you encounter installation errors, see
[common problems](#common-problems) for some solutions.
-## Pip Installation
+## Pip installation
[Pip](https://en.wikipedia.org/wiki/Pip_(package_manager)) is a package
management system used to install and manage software packages written in
-Python.
+Python. We provide pip packages for TensorFlow on Linux, Mac OS X, and
+Windows. For Windows instructions, please see [Pip installation on
+Windows](#pip-installation-on-windows).
The packages that will be installed or upgraded during the pip install are
listed in the [REQUIRED_PACKAGES section of
@@ -59,41 +61,58 @@ $ sudo easy_install pip
$ sudo easy_install --upgrade six
```
-Then, select the correct binary to install:
+We have also uploaded the CPU version of the binaries to PyPI, so you can
+simply install on Linux, Mac or Windows with:
+
+```bash
+$ pip install tensorflow
+```
+
+Note that you will need pip version 8.1 or later for the above command to work on Linux.
+
+For Windows users, you can also install the GPU version of the binary with:
+```bash
+$ pip install tensorflow-gpu
+```
+Unfortunately, this command is not yet available for Linux or Mac GPU binaries
+due to their sizes exceeding the PyPI limit.
+
+If the above commands do not work on your system or you want to install the GPU version
+of the binary on Linux or Mac, you can follow these instructions:
```bash
# Ubuntu/Linux 64-bit, CPU only, Python 2.7
-$ export TF_BINARY_URL=https://storage.googleapis.com/tensorflow/linux/cpu/tensorflow-0.11.0-cp27-none-linux_x86_64.whl
+$ export TF_BINARY_URL=https://storage.googleapis.com/tensorflow/linux/cpu/tensorflow-0.12.0rc0-cp27-none-linux_x86_64.whl
# Ubuntu/Linux 64-bit, GPU enabled, Python 2.7
# Requires CUDA toolkit 8.0 and CuDNN v5. For other versions, see "Installing from sources" below.
-$ export TF_BINARY_URL=https://storage.googleapis.com/tensorflow/linux/gpu/tensorflow-0.11.0-cp27-none-linux_x86_64.whl
+$ export TF_BINARY_URL=https://storage.googleapis.com/tensorflow/linux/gpu/tensorflow_gpu-0.12.0rc0-cp27-none-linux_x86_64.whl
# Mac OS X, CPU only, Python 2.7:
-$ export TF_BINARY_URL=https://storage.googleapis.com/tensorflow/mac/cpu/tensorflow-0.11.0-py2-none-any.whl
+$ export TF_BINARY_URL=https://storage.googleapis.com/tensorflow/mac/cpu/tensorflow-0.12.0rc0-py2-none-any.whl
# Mac OS X, GPU enabled, Python 2.7:
-$ export TF_BINARY_URL=https://storage.googleapis.com/tensorflow/mac/gpu/tensorflow-0.11.0-py2-none-any.whl
+$ export TF_BINARY_URL=https://storage.googleapis.com/tensorflow/mac/gpu/tensorflow_gpu-0.12.0rc0-py2-none-any.whl
# Ubuntu/Linux 64-bit, CPU only, Python 3.4
-$ export TF_BINARY_URL=https://storage.googleapis.com/tensorflow/linux/cpu/tensorflow-0.11.0-cp34-cp34m-linux_x86_64.whl
+$ export TF_BINARY_URL=https://storage.googleapis.com/tensorflow/linux/cpu/tensorflow-0.12.0rc0-cp34-cp34m-linux_x86_64.whl
# Ubuntu/Linux 64-bit, GPU enabled, Python 3.4
# Requires CUDA toolkit 8.0 and CuDNN v5. For other versions, see "Installing from sources" below.
-$ export TF_BINARY_URL=https://storage.googleapis.com/tensorflow/linux/gpu/tensorflow-0.11.0-cp34-cp34m-linux_x86_64.whl
+$ export TF_BINARY_URL=https://storage.googleapis.com/tensorflow/linux/gpu/tensorflow_gpu-0.12.0rc0-cp34-cp34m-linux_x86_64.whl
# Ubuntu/Linux 64-bit, CPU only, Python 3.5
-$ export TF_BINARY_URL=https://storage.googleapis.com/tensorflow/linux/cpu/tensorflow-0.11.0-cp35-cp35m-linux_x86_64.whl
+$ export TF_BINARY_URL=https://storage.googleapis.com/tensorflow/linux/cpu/tensorflow-0.12.0rc0-cp35-cp35m-linux_x86_64.whl
# Ubuntu/Linux 64-bit, GPU enabled, Python 3.5
# Requires CUDA toolkit 8.0 and CuDNN v5. For other versions, see "Installing from sources" below.
-$ export TF_BINARY_URL=https://storage.googleapis.com/tensorflow/linux/gpu/tensorflow-0.11.0-cp35-cp35m-linux_x86_64.whl
+$ export TF_BINARY_URL=https://storage.googleapis.com/tensorflow/linux/gpu/tensorflow_gpu-0.12.0rc0-cp35-cp35m-linux_x86_64.whl
# Mac OS X, CPU only, Python 3.4 or 3.5:
-$ export TF_BINARY_URL=https://storage.googleapis.com/tensorflow/mac/cpu/tensorflow-0.11.0-py3-none-any.whl
+$ export TF_BINARY_URL=https://storage.googleapis.com/tensorflow/mac/cpu/tensorflow-0.12.0rc0-py3-none-any.whl
# Mac OS X, GPU enabled, Python 3.4 or 3.5:
-$ export TF_BINARY_URL=https://storage.googleapis.com/tensorflow/mac/gpu/tensorflow-0.11.0-py3-none-any.whl
+$ export TF_BINARY_URL=https://storage.googleapis.com/tensorflow/mac/gpu/tensorflow_gpu-0.12.0rc0-py3-none-any.whl
```
Install TensorFlow:
@@ -114,6 +133,36 @@ protobuf dependency.
You can now [test your installation](#test-the-tensorflow-installation).
+
+### Pip installation on Windows
+
+TensorFlow supports only 64-bit Python 3.5 on Windows. We have tested
+the pip packages with the following distributions of Python:
+
+* [Python 3.5 from python.org](https://www.python.org/downloads/release/python-352/)
+* [Python 3.5 from Anaconda](https://www.continuum.io/downloads#windows)
+
+Both distributions include pip. To install the CPU-only version of
+TensorFlow, enter the following command at a command prompt:
+
+```bat
+C:\> pip install --upgrade https://storage.googleapis.com/tensorflow/windows/cpu/tensorflow-0.12.0rc0-cp35-cp35m-win_amd64.whl
+```
+
+To install the GPU version of TensorFlow, enter the following command
+at a command prompt:
+
+```bat
+C:\> pip install --upgrade https://storage.googleapis.com/tensorflow/windows/gpu/tensorflow_gpu-0.12.0rc0-cp35-cp35m-win_amd64.whl
+```
+
+You can now [test your installation](#test-the-tensorflow-installation).
+
+You can also [use Virtualenv](#virtualenv-installation) or [Anaconda
+environments](#anaconda-installation) to manage your installation of
+TensorFlow on Windows.
+
+
## Virtualenv installation
[Virtualenv](http://docs.python-guide.org/en/latest/dev/virtualenvs/) is a tool
@@ -159,37 +208,37 @@ Now, install TensorFlow just as you would for a regular Pip installation. First
```bash
# Ubuntu/Linux 64-bit, CPU only, Python 2.7
-(tensorflow)$ export TF_BINARY_URL=https://storage.googleapis.com/tensorflow/linux/cpu/tensorflow-0.11.0-cp27-none-linux_x86_64.whl
+(tensorflow)$ export TF_BINARY_URL=https://storage.googleapis.com/tensorflow/linux/cpu/tensorflow-0.12.0rc0-cp27-none-linux_x86_64.whl
# Ubuntu/Linux 64-bit, GPU enabled, Python 2.7
# Requires CUDA toolkit 8.0 and CuDNN v5. For other versions, see "Installing from sources" below.
-(tensorflow)$ export TF_BINARY_URL=https://storage.googleapis.com/tensorflow/linux/gpu/tensorflow-0.11.0-cp27-none-linux_x86_64.whl
+(tensorflow)$ export TF_BINARY_URL=https://storage.googleapis.com/tensorflow/linux/gpu/tensorflow_gpu-0.12.0rc0-cp27-none-linux_x86_64.whl
# Mac OS X, CPU only, Python 2.7:
-(tensorflow)$ export TF_BINARY_URL=https://storage.googleapis.com/tensorflow/mac/cpu/tensorflow-0.11.0-py2-none-any.whl
+(tensorflow)$ export TF_BINARY_URL=https://storage.googleapis.com/tensorflow/mac/cpu/tensorflow-0.12.0rc0-py2-none-any.whl
# Mac OS X, GPU enabled, Python 2.7:
-(tensorflow)$ export TF_BINARY_URL=https://storage.googleapis.com/tensorflow/mac/gpu/tensorflow-0.11.0-py2-none-any.whl
+(tensorflow)$ export TF_BINARY_URL=https://storage.googleapis.com/tensorflow/mac/gpu/tensorflow_gpu-0.12.0rc0-py2-none-any.whl
# Ubuntu/Linux 64-bit, CPU only, Python 3.4
-(tensorflow)$ export TF_BINARY_URL=https://storage.googleapis.com/tensorflow/linux/cpu/tensorflow-0.11.0-cp34-cp34m-linux_x86_64.whl
+(tensorflow)$ export TF_BINARY_URL=https://storage.googleapis.com/tensorflow/linux/cpu/tensorflow-0.12.0rc0-cp34-cp34m-linux_x86_64.whl
# Ubuntu/Linux 64-bit, GPU enabled, Python 3.4
# Requires CUDA toolkit 8.0 and CuDNN v5. For other versions, see "Installing from sources" below.
-(tensorflow)$ export TF_BINARY_URL=https://storage.googleapis.com/tensorflow/linux/gpu/tensorflow-0.11.0-cp34-cp34m-linux_x86_64.whl
+(tensorflow)$ export TF_BINARY_URL=https://storage.googleapis.com/tensorflow/linux/gpu/tensorflow_gpu-0.12.0rc0-cp34-cp34m-linux_x86_64.whl
# Ubuntu/Linux 64-bit, CPU only, Python 3.5
-(tensorflow)$ export TF_BINARY_URL=https://storage.googleapis.com/tensorflow/linux/cpu/tensorflow-0.11.0-cp35-cp35m-linux_x86_64.whl
+(tensorflow)$ export TF_BINARY_URL=https://storage.googleapis.com/tensorflow/linux/cpu/tensorflow-0.12.0rc0-cp35-cp35m-linux_x86_64.whl
# Ubuntu/Linux 64-bit, GPU enabled, Python 3.5
# Requires CUDA toolkit 8.0 and CuDNN v5. For other versions, see "Installing from sources" below.
-(tensorflow)$ export TF_BINARY_URL=https://storage.googleapis.com/tensorflow/linux/gpu/tensorflow-0.11.0-cp35-cp35m-linux_x86_64.whl
+(tensorflow)$ export TF_BINARY_URL=https://storage.googleapis.com/tensorflow/linux/gpu/tensorflow_gpu-0.12.0rc0-cp35-cp35m-linux_x86_64.whl
# Mac OS X, CPU only, Python 3.4 or 3.5:
-(tensorflow)$ export TF_BINARY_URL=https://storage.googleapis.com/tensorflow/mac/cpu/tensorflow-0.11.0-py3-none-any.whl
+(tensorflow)$ export TF_BINARY_URL=https://storage.googleapis.com/tensorflow/mac/cpu/tensorflow-0.12.0rc0-py3-none-any.whl
# Mac OS X, GPU enabled, Python 3.4 or 3.5:
-(tensorflow)$ export TF_BINARY_URL=https://storage.googleapis.com/tensorflow/mac/gpu/tensorflow-0.11.0-py3-none-any.whl
+(tensorflow)$ export TF_BINARY_URL=https://storage.googleapis.com/tensorflow/mac/gpu/tensorflow_gpu-0.12.0rc0-py3-none-any.whl
```
Finally install TensorFlow:
@@ -251,6 +300,19 @@ Install Anaconda:
Follow the instructions on the [Anaconda download
site](https://www.continuum.io/downloads).
+Note: If TensorFlow was previously installed via pip outside the Anaconda
+environment, you should uninstall it before using the TensorFlow installed
+within an Anaconda environment, because Anaconda searches the system
+site-packages from `.local` with higher priority.
+```bash
+# Python 2
+$ pip uninstall tensorflow
+
+# Python 3
+$ pip3 uninstall tensorflow
+```
+
+
Create a conda environment called `tensorflow`:
```bash
@@ -298,37 +360,37 @@ select the correct binary to install:
```bash
# Ubuntu/Linux 64-bit, CPU only, Python 2.7
-(tensorflow)$ export TF_BINARY_URL=https://storage.googleapis.com/tensorflow/linux/cpu/tensorflow-0.11.0-cp27-none-linux_x86_64.whl
+(tensorflow)$ export TF_BINARY_URL=https://storage.googleapis.com/tensorflow/linux/cpu/tensorflow-0.12.0rc0-cp27-none-linux_x86_64.whl
# Ubuntu/Linux 64-bit, GPU enabled, Python 2.7
# Requires CUDA toolkit 8.0 and CuDNN v5. For other versions, see "Installing from sources" below.
-(tensorflow)$ export TF_BINARY_URL=https://storage.googleapis.com/tensorflow/linux/gpu/tensorflow-0.11.0-cp27-none-linux_x86_64.whl
+(tensorflow)$ export TF_BINARY_URL=https://storage.googleapis.com/tensorflow/linux/gpu/tensorflow_gpu-0.12.0rc0-cp27-none-linux_x86_64.whl
# Mac OS X, CPU only, Python 2.7:
-(tensorflow)$ export TF_BINARY_URL=https://storage.googleapis.com/tensorflow/mac/cpu/tensorflow-0.11.0-py2-none-any.whl
+(tensorflow)$ export TF_BINARY_URL=https://storage.googleapis.com/tensorflow/mac/cpu/tensorflow-0.12.0rc0-py2-none-any.whl
# Mac OS X, GPU enabled, Python 2.7:
-(tensorflow)$ export TF_BINARY_URL=https://storage.googleapis.com/tensorflow/mac/gpu/tensorflow-0.11.0-py2-none-any.whl
+(tensorflow)$ export TF_BINARY_URL=https://storage.googleapis.com/tensorflow/mac/gpu/tensorflow_gpu-0.12.0rc0-py2-none-any.whl
# Ubuntu/Linux 64-bit, CPU only, Python 3.4
-(tensorflow)$ export TF_BINARY_URL=https://storage.googleapis.com/tensorflow/linux/cpu/tensorflow-0.11.0-cp34-cp34m-linux_x86_64.whl
+(tensorflow)$ export TF_BINARY_URL=https://storage.googleapis.com/tensorflow/linux/cpu/tensorflow-0.12.0rc0-cp34-cp34m-linux_x86_64.whl
# Ubuntu/Linux 64-bit, GPU enabled, Python 3.4
# Requires CUDA toolkit 8.0 and CuDNN v5. For other versions, see "Installing from sources" below.
-(tensorflow)$ export TF_BINARY_URL=https://storage.googleapis.com/tensorflow/linux/gpu/tensorflow-0.11.0-cp34-cp34m-linux_x86_64.whl
+(tensorflow)$ export TF_BINARY_URL=https://storage.googleapis.com/tensorflow/linux/gpu/tensorflow_gpu-0.12.0rc0-cp34-cp34m-linux_x86_64.whl
# Ubuntu/Linux 64-bit, CPU only, Python 3.5
-(tensorflow)$ export TF_BINARY_URL=https://storage.googleapis.com/tensorflow/linux/cpu/tensorflow-0.11.0-cp35-cp35m-linux_x86_64.whl
+(tensorflow)$ export TF_BINARY_URL=https://storage.googleapis.com/tensorflow/linux/cpu/tensorflow-0.12.0rc0-cp35-cp35m-linux_x86_64.whl
# Ubuntu/Linux 64-bit, GPU enabled, Python 3.5
# Requires CUDA toolkit 8.0 and CuDNN v5. For other versions, see "Installing from sources" below.
-(tensorflow)$ export TF_BINARY_URL=https://storage.googleapis.com/tensorflow/linux/gpu/tensorflow-0.11.0-cp35-cp35m-linux_x86_64.whl
+(tensorflow)$ export TF_BINARY_URL=https://storage.googleapis.com/tensorflow/linux/gpu/tensorflow_gpu-0.12.0rc0-cp35-cp35m-linux_x86_64.whl
# Mac OS X, CPU only, Python 3.4 or 3.5:
-(tensorflow)$ export TF_BINARY_URL=https://storage.googleapis.com/tensorflow/mac/cpu/tensorflow-0.11.0-py3-none-any.whl
+(tensorflow)$ export TF_BINARY_URL=https://storage.googleapis.com/tensorflow/mac/cpu/tensorflow-0.12.0rc0-py3-none-any.whl
# Mac OS X, GPU enabled, Python 3.4 or 3.5:
-(tensorflow)$ export TF_BINARY_URL=https://storage.googleapis.com/tensorflow/mac/gpu/tensorflow-0.11.0-py3-none-any.whl
+(tensorflow)$ export TF_BINARY_URL=https://storage.googleapis.com/tensorflow/mac/gpu/tensorflow_gpu-0.12.0rc0-py3-none-any.whl
```
Finally install TensorFlow:
@@ -396,7 +458,7 @@ code.
code.
We also have tags with `latest` replaced by a released version (e.g.,
-`0.11.0-gpu`).
+`0.12.0-rc0-gpu`).
With Docker the installation is as follows:
@@ -493,6 +555,12 @@ When installing from source you will build a pip wheel that you then install
using pip. You'll need pip for that, so install it as described
[above](#pip-installation).
+To build TensorFlow from source on Windows, you can use experimental
+support for [Bazel on
+Windows](https://bazel.build/versions/master/docs/windows.html) or the
+[TensorFlow CMake
+build](https://github.com/tensorflow/tensorflow/tree/r0.12/tensorflow/contrib/cmake).
+
### Clone the TensorFlow repository
```bash
@@ -570,8 +638,8 @@ to reflect the cuDNN version you downloaded):
``` bash
tar xvzf cudnn-8.0-linux-x64-v5.1-ga.tgz
-sudo cp -P cuda/include/cudnn.h /usr/local/cuda/include
-sudo cp -P cuda/lib64/libcudnn* /usr/local/cuda/lib64
+sudo cp -P cuda/include/cudnn.h /usr/local/cuda/include/
+sudo cp -P cuda/lib64/libcudnn* /usr/local/cuda/lib64/
sudo chmod a+r /usr/local/cuda/include/cudnn.h /usr/local/cuda/lib64/libcudnn*
```
@@ -792,7 +860,7 @@ $ bazel build -c opt --config=cuda //tensorflow/tools/pip_package:build_pip_pack
$ bazel-bin/tensorflow/tools/pip_package/build_pip_package /tmp/tensorflow_pkg
# The name of the .whl file will depend on your platform.
-$ sudo pip install /tmp/tensorflow_pkg/tensorflow-0.11.0-py2-none-any.whl
+$ sudo pip install /tmp/tensorflow_pkg/tensorflow-0.12.0rc0-py2-none-any.whl
```
## Optimizing CPU performance
diff --git a/tensorflow/g3doc/how_tos/adding_an_op/index.md b/tensorflow/g3doc/how_tos/adding_an_op/index.md
index 306ddaf4d4..88d0cf9e1c 100644
--- a/tensorflow/g3doc/how_tos/adding_an_op/index.md
+++ b/tensorflow/g3doc/how_tos/adding_an_op/index.md
@@ -1065,7 +1065,7 @@ def _zero_out_grad(op, grad):
shape = array_ops.shape(to_zero)
index = array_ops.zeros_like(shape)
first_grad = array_ops.reshape(grad, [-1])[0]
- to_zero_grad = sparse_ops.sparse_to_dense(index, shape, first_grad, 0)
+ to_zero_grad = sparse_ops.sparse_to_dense([index], shape, first_grad, 0)
return [to_zero_grad] # List of one Tensor, since we have one input
```
diff --git a/tensorflow/g3doc/how_tos/adding_an_op/zero_out_2_test.py b/tensorflow/g3doc/how_tos/adding_an_op/zero_out_2_test.py
index 60c429eefe..2598af4b27 100644
--- a/tensorflow/g3doc/how_tos/adding_an_op/zero_out_2_test.py
+++ b/tensorflow/g3doc/how_tos/adding_an_op/zero_out_2_test.py
@@ -31,6 +31,11 @@ class ZeroOut2Test(tf.test.TestCase):
result = zero_out_op_2.zero_out([5, 4, 3, 2, 1])
self.assertAllEqual(result.eval(), [5, 0, 0, 0, 0])
+ def test_2d(self):
+ with self.test_session():
+ result = zero_out_op_2.zero_out([[6, 5, 4], [3, 2, 1]])
+ self.assertAllEqual(result.eval(), [[6, 0, 0], [0, 0, 0]])
+
def test_grad(self):
with self.test_session():
shape = (5,)
@@ -39,6 +44,14 @@ class ZeroOut2Test(tf.test.TestCase):
err = tf.test.compute_gradient_error(x, shape, y, shape)
self.assertLess(err, 1e-4)
+ def test_grad_2d(self):
+ with self.test_session():
+ shape = (2, 3)
+ x = tf.constant([[6, 5, 4], [3, 2, 1]], dtype=tf.float32)
+ y = zero_out_op_2.zero_out(x)
+ err = tf.test.compute_gradient_error(x, shape, y, shape)
+ self.assertLess(err, 1e-4)
+
if __name__ == '__main__':
tf.test.main()
diff --git a/tensorflow/g3doc/how_tos/adding_an_op/zero_out_grad_2.py b/tensorflow/g3doc/how_tos/adding_an_op/zero_out_grad_2.py
index 9734c0ce96..dc24678e33 100644
--- a/tensorflow/g3doc/how_tos/adding_an_op/zero_out_grad_2.py
+++ b/tensorflow/g3doc/how_tos/adding_an_op/zero_out_grad_2.py
@@ -40,5 +40,5 @@ def _zero_out_grad(op, grad):
shape = array_ops.shape(to_zero)
index = array_ops.zeros_like(shape)
first_grad = array_ops.reshape(grad, [-1])[0]
- to_zero_grad = sparse_ops.sparse_to_dense(index, shape, first_grad, 0)
+ to_zero_grad = sparse_ops.sparse_to_dense([index], shape, first_grad, 0)
return [to_zero_grad] # List of one Tensor, since we have one input
diff --git a/tensorflow/g3doc/how_tos/hadoop/index.md b/tensorflow/g3doc/how_tos/hadoop/index.md
index c6c665b1ae..a2dd67babd 100644
--- a/tensorflow/g3doc/how_tos/hadoop/index.md
+++ b/tensorflow/g3doc/how_tos/hadoop/index.md
@@ -32,7 +32,9 @@ be set:
source ${HADOOP_HOME}/libexec/hadoop-config.sh
```
-* **LD_LIBRARY_PATH**: To include the path to libjvm.so. On Linux:
+* **LD_LIBRARY_PATH**: To include the path to libjvm.so, and optionally the path
+ to libhdfs.so if your Hadoop distribution does not install libhdfs.so in
+ `$HADOOP_HDFS_HOME/lib/native`. On Linux:
```shell
export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:${JAVA_HOME}/jre/lib/amd64/server
diff --git a/tensorflow/g3doc/how_tos/supervisor/index.md b/tensorflow/g3doc/how_tos/supervisor/index.md
index 69b5562c28..7b3e7677a2 100644
--- a/tensorflow/g3doc/how_tos/supervisor/index.md
+++ b/tensorflow/g3doc/how_tos/supervisor/index.md
@@ -374,7 +374,7 @@ following keyword arguments to the `Supervisor()` constructor:
* `ready_op`: Op to check if the model is initialized.
After running the local init op, the init op, and the init function, the
- supervisor verifies that the model is fully intialized by running the ready
+ supervisor verifies that the model is fully initialized by running the ready
op. This is an op that returns an empty string if the model is initialized,
or a description of what parts of the model are not initialized if not.
diff --git a/tensorflow/g3doc/resources/roadmap.md b/tensorflow/g3doc/resources/roadmap.md
index 32a6da639b..51a7f9450e 100644
--- a/tensorflow/g3doc/resources/roadmap.md
+++ b/tensorflow/g3doc/resources/roadmap.md
@@ -30,7 +30,6 @@ C and C++ APIs for:
### Platforms
* OpenCL support ([#22](https://github.com/tensorflow/tensorflow/issues/22))
-* Windows support ([#17](https://github.com/tensorflow/tensorflow/issues/17))
### Community
* More educational resources
diff --git a/tensorflow/g3doc/tutorials/input_fn/index.md b/tensorflow/g3doc/tutorials/input_fn/index.md
index 8b3b10f242..831576433e 100644
--- a/tensorflow/g3doc/tutorials/input_fn/index.md
+++ b/tensorflow/g3doc/tutorials/input_fn/index.md
@@ -289,7 +289,7 @@ accept a _pandas_ `Dataframe` and return feature column and label values as
```python
def input_fn(data_set):
- feature_cols = {k: tf.constant(data_set[k].values
+ feature_cols = {k: tf.constant(data_set[k].values)
for k in FEATURES}
labels = tf.constant(data_set[LABEL].values)
return feature_cols, labels
diff --git a/tensorflow/g3doc/tutorials/wide/index.md b/tensorflow/g3doc/tutorials/wide/index.md
index b8807fa44f..d30ad11374 100644
--- a/tensorflow/g3doc/tutorials/wide/index.md
+++ b/tensorflow/g3doc/tutorials/wide/index.md
@@ -173,7 +173,7 @@ construct an [Input Reader](https://www.tensorflow.org/versions/r0.9/api_docs/py
that represents a file or other data source, and iterates through the file as
TensorFlow runs the graph. Each continuous column in the train or test dataframe
will be converted into a `Tensor`, which in general is a good format to
-represent dense data. For cateogorical data, we must represent the data as a
+represent dense data. For categorical data, we must represent the data as a
`SparseTensor`. This data format is good for representing sparse data.
```python
diff --git a/tensorflow/python/BUILD b/tensorflow/python/BUILD
index 446ad26515..28b947676c 100644
--- a/tensorflow/python/BUILD
+++ b/tensorflow/python/BUILD
@@ -1871,7 +1871,7 @@ tf_proto_library(
name = "cpp_shape_inference_proto",
srcs = ["framework/cpp_shape_inference.proto"],
cc_api_version = 2,
- cc_libs = ["//tensorflow/core:protos_all_cc"],
+ protodeps = ["//tensorflow/core:protos_all"],
)
py_test(
diff --git a/tensorflow/python/framework/constant_op.py b/tensorflow/python/framework/constant_op.py
index f06b477bc8..3bcc537779 100644
--- a/tensorflow/python/framework/constant_op.py
+++ b/tensorflow/python/framework/constant_op.py
@@ -114,7 +114,7 @@ from tensorflow.python.framework import tensor_shape
from tensorflow.python.framework import tensor_util
-def constant(value, dtype=None, shape=None, name="Const"):
+def constant(value, dtype=None, shape=None, name="Const", verify_shape=False):
"""Creates a constant tensor.
The resulting tensor is populated with values of type `dtype`, as
@@ -146,13 +146,15 @@ def constant(value, dtype=None, shape=None, name="Const"):
```
Args:
- value: A constant value (or list) of output type `dtype`.
+ value: A constant value (or list) of output type `dtype`.
- dtype: The type of the elements of the resulting tensor.
+ dtype: The type of the elements of the resulting tensor.
- shape: Optional dimensions of resulting tensor.
+ shape: Optional dimensions of resulting tensor.
- name: Optional name for the tensor.
+ name: Optional name for the tensor.
+
+  verify_shape: Boolean that enables verification of the shape of values.
Returns:
A Constant Tensor.
@@ -160,7 +162,7 @@ def constant(value, dtype=None, shape=None, name="Const"):
g = ops.get_default_graph()
tensor_value = attr_value_pb2.AttrValue()
tensor_value.tensor.CopyFrom(
- tensor_util.make_tensor_proto(value, dtype=dtype, shape=shape))
+ tensor_util.make_tensor_proto(value, dtype=dtype, shape=shape, verify_shape=verify_shape))
dtype_value = attr_value_pb2.AttrValue(type=tensor_value.tensor.dtype)
const_tensor = g.create_op(
"Const", [], [dtype_value.type],
diff --git a/tensorflow/python/framework/tensor_util.py b/tensorflow/python/framework/tensor_util.py
index 74390bd6a3..de03c6ac7f 100644
--- a/tensorflow/python/framework/tensor_util.py
+++ b/tensorflow/python/framework/tensor_util.py
@@ -219,6 +219,18 @@ def _NotNone(v):
return v
+def _FilterTuple(v):
+ if not isinstance(v, (list, tuple)):
+ return v
+ if isinstance(v, tuple):
+ if not any(isinstance(x, (list, tuple)) for x in v):
+ return None
+ if isinstance(v, list):
+ if not any(isinstance(x, (list, tuple)) for x in v):
+ return _FirstNotNone([None if isinstance(x, (list, tuple)) else x for x in v])
+ return _FirstNotNone([_FilterTuple(x) for x in v])
+
+
def _FilterInt(v):
if isinstance(v, (list, tuple)):
return _FirstNotNone([_FilterInt(x) for x in v])
@@ -259,29 +271,29 @@ def _FilterNotTensor(v):
_TF_TO_IS_OK = {
- dtypes.bool: _FilterBool,
- dtypes.complex128: _FilterComplex,
- dtypes.complex64: _FilterComplex,
- dtypes.float32: _FilterFloat,
- dtypes.float64: _FilterFloat,
- dtypes.int16: _FilterInt,
- dtypes.int32: _FilterInt,
- dtypes.int64: _FilterInt,
- dtypes.int8: _FilterInt,
- dtypes.qint16: _FilterInt,
- dtypes.qint32: _FilterInt,
- dtypes.qint8: _FilterInt,
- dtypes.quint16: _FilterInt,
- dtypes.quint8: _FilterInt,
- dtypes.string: _FilterStr,
- dtypes.uint16: _FilterInt,
- dtypes.uint8: _FilterInt,
+ dtypes.bool: [_FilterBool],
+ dtypes.complex128: [_FilterComplex],
+ dtypes.complex64: [_FilterComplex],
+ dtypes.float32: [_FilterFloat],
+ dtypes.float64: [_FilterFloat],
+ dtypes.int16: [_FilterInt],
+ dtypes.int32: [_FilterInt],
+ dtypes.int64: [_FilterInt],
+ dtypes.int8: [_FilterInt],
+ dtypes.qint16: [_FilterInt, _FilterTuple],
+ dtypes.qint32: [_FilterInt, _FilterTuple],
+ dtypes.qint8: [_FilterInt, _FilterTuple],
+ dtypes.quint16: [_FilterInt, _FilterTuple],
+ dtypes.quint8: [_FilterInt, _FilterTuple],
+ dtypes.string: [_FilterStr],
+ dtypes.uint16: [_FilterInt],
+ dtypes.uint8: [_FilterInt],
}
def _AssertCompatible(values, dtype):
- fn = _TF_TO_IS_OK.get(dtype, _FilterNotTensor)
- mismatch = fn(values)
+ fn_list = _TF_TO_IS_OK.get(dtype, [_FilterNotTensor])
+ mismatch = _FirstNotNone([fn(values) for fn in fn_list])
if mismatch is not None:
if dtype is None:
raise TypeError("List of Tensors when single Tensor expected")
@@ -290,13 +302,14 @@ def _AssertCompatible(values, dtype):
(dtype.name, repr(mismatch), type(mismatch).__name__))
-def make_tensor_proto(values, dtype=None, shape=None):
+def make_tensor_proto(values, dtype=None, shape=None, verify_shape=False):
"""Create a TensorProto.
Args:
- values: Values to put in the TensorProto.
- dtype: Optional tensor_pb2 DataType value.
- shape: List of integers representing the dimensions of tensor.
+ values: Values to put in the TensorProto.
+ dtype: Optional tensor_pb2 DataType value.
+ shape: List of integers representing the dimensions of tensor.
+  verify_shape: Boolean that enables verification of the shape of values.
Returns:
A TensorProto. Depending on the type, it may contain data in the
@@ -306,7 +319,8 @@ def make_tensor_proto(values, dtype=None, shape=None):
Raises:
TypeError: if unsupported types are provided.
- ValueError: if arguments have inappropriate values.
+ ValueError: if arguments have inappropriate values or if verify_shape is
+ True and the shape of values does not match the shape from the argument.
make_tensor_proto accepts "values" of a python scalar, a python list, a
numpy ndarray, or a numpy scalar.
@@ -396,6 +410,11 @@ def make_tensor_proto(values, dtype=None, shape=None):
shape_size = np.prod(shape)
is_same_size = shape_size == nparray.size
+ if verify_shape:
+ if not nparray.shape == tuple(shape):
+ raise TypeError("Expected Tensor's shape: %s, got %s." %
+ (tuple(shape), nparray.shape))
+
if nparray.size > shape_size:
raise ValueError(
"Too many elements provided. Needed at most %d, but received %d" %
diff --git a/tensorflow/python/framework/tensor_util_test.py b/tensorflow/python/framework/tensor_util_test.py
index 8f9af29247..20e8601d73 100644
--- a/tensorflow/python/framework/tensor_util_test.py
+++ b/tensorflow/python/framework/tensor_util_test.py
@@ -514,9 +514,23 @@ class TensorUtilTest(tf.test.TestCase):
self.assertEquals(np.complex128, a.dtype)
self.assertAllEqual(np.array([[(1+2j), (3+4j)], [(5+6j), (7+8j)]]), a)
- def testUnsupportedDType(self):
+ def testUnsupportedDTypes(self):
with self.assertRaises(TypeError):
tensor_util.make_tensor_proto(np.array([1]), 0)
+ with self.assertRaises(TypeError):
+ tensor_util.make_tensor_proto(3, dtype=tf.qint8)
+ with self.assertRaises(TypeError):
+ tensor_util.make_tensor_proto([3], dtype=tf.qint8)
+
+ def testTensorShapeVerification(self):
+ array = np.array([[1], [2]])
+ correct_shape = (2, 1)
+ incorrect_shape = (1, 2)
+ tensor_util.make_tensor_proto(array, shape=correct_shape,
+ verify_shape=True)
+ with self.assertRaises(TypeError):
+ tensor_util.make_tensor_proto(array, shape=incorrect_shape,
+ verify_shape=True)
def testShapeTooLarge(self):
with self.assertRaises(ValueError):
diff --git a/tensorflow/python/framework/versions_test.py b/tensorflow/python/framework/versions_test.py
index b5bcc88e18..d94e892cd5 100644
--- a/tensorflow/python/framework/versions_test.py
+++ b/tensorflow/python/framework/versions_test.py
@@ -27,8 +27,8 @@ class VersionTest(tf.test.TestCase):
self.assertEqual(type(tf.__version__), str)
self.assertEqual(type(tf.VERSION), str)
# This pattern will need to grow as we include alpha, builds, etc.
- self.assertRegexpMatches(tf.__version__, r'^\d+\.\d+\.\w+$')
- self.assertRegexpMatches(tf.VERSION, r'^\d+\.\d+\.\w+$')
+ self.assertRegexpMatches(tf.__version__, r'^\d+\.\d+\.(\d+(\-\w+)?|head)$')
+ self.assertRegexpMatches(tf.VERSION, r'^\d+\.\d+\.(\d+(\-\w+)?|head)$')
def testGraphDefVersion(self):
version = tf.GRAPH_DEF_VERSION
diff --git a/tensorflow/python/kernel_tests/BUILD b/tensorflow/python/kernel_tests/BUILD
index 0712368253..be5b394ba8 100644
--- a/tensorflow/python/kernel_tests/BUILD
+++ b/tensorflow/python/kernel_tests/BUILD
@@ -117,6 +117,14 @@ tf_py_test(
)
tf_py_test(
+ name = "decode_image_op_test",
+ size = "small",
+ srcs = ["decode_image_op_test.py"],
+ additional_deps = ["//tensorflow:tensorflow_py"],
+ data = ["//tensorflow/core:image_testdata"],
+)
+
+tf_py_test(
name = "decode_raw_op_test",
size = "small",
srcs = ["decode_raw_op_test.py"],
diff --git a/tensorflow/python/kernel_tests/basic_gpu_test.py b/tensorflow/python/kernel_tests/basic_gpu_test.py
index 541c95aa28..979347dec8 100644
--- a/tensorflow/python/kernel_tests/basic_gpu_test.py
+++ b/tensorflow/python/kernel_tests/basic_gpu_test.py
@@ -39,7 +39,7 @@ class GPUBinaryOpsTest(tf.test.TestCase):
tf_cpu = sess.run(out)
self.assertAllClose(tf_cpu, tf_gpu)
-
+
def testFloatBasic(self):
x = np.linspace(-5, 20, 15).reshape(1, 3, 5).astype(np.float32)
y = np.linspace(20, -5, 15).reshape(1, 3, 5).astype(np.float32)
@@ -47,15 +47,159 @@ class GPUBinaryOpsTest(tf.test.TestCase):
self._compareGPU(x, y, np.subtract, tf.sub)
self._compareGPU(x, y, np.multiply, tf.mul)
self._compareGPU(x, y + 0.1, np.true_divide, tf.truediv)
+ self._compareGPU(x, y + 0.1, np.floor_divide, tf.floordiv)
+ self._compareGPU(x, y, np.power, tf.pow)
+
+ def testFloatWithBCast(self):
+ x = np.linspace(-5, 20, 15).reshape(3, 5).astype(np.float32)
+ y = np.linspace(20, -5, 30).reshape(2, 3, 5).astype(np.float32)
+ self._compareGPU(x, y, np.add, tf.add)
+ self._compareGPU(x, y, np.subtract, tf.sub)
+ self._compareGPU(x, y, np.multiply, tf.mul)
+ self._compareGPU(x, y + 0.1, np.true_divide, tf.truediv)
+
+ def testDoubleBasic(self):
+ x = np.linspace(-5, 20, 15).reshape(1, 3, 5).astype(np.float64)
+ y = np.linspace(20, -5, 15).reshape(1, 3, 5).astype(np.float64)
+ self._compareGPU(x, y, np.add, tf.add)
+ self._compareGPU(x, y, np.subtract, tf.sub)
+ self._compareGPU(x, y, np.multiply, tf.mul)
+ self._compareGPU(x, y + 0.1, np.true_divide, tf.truediv)
+
+ def testDoubleWithBCast(self):
+ x = np.linspace(-5, 20, 15).reshape(3, 5).astype(np.float64)
+ y = np.linspace(20, -5, 30).reshape(2, 3, 5).astype(np.float64)
+ self._compareGPU(x, y, np.add, tf.add)
+ self._compareGPU(x, y, np.subtract, tf.sub)
+ self._compareGPU(x, y, np.multiply, tf.mul)
+ self._compareGPU(x, y + 0.1, np.true_divide, tf.truediv)
+
+
+class MathBuiltinUnaryTest(tf.test.TestCase):
+ def _compare(self, x, np_func, tf_func, use_gpu):
+ np_out = np_func(x)
+ with self.test_session(use_gpu=use_gpu) as sess:
+ inx = tf.convert_to_tensor(x)
+ ofunc = tf_func(inx)
+ tf_out = sess.run(ofunc)
+ self.assertAllClose(np_out, tf_out)
+
+ def _inv(self, x):
+ return 1.0 / x
+
+ def _rsqrt(self, x):
+ return self._inv(np.sqrt(x))
+
+ def _testDtype(self, dtype, use_gpu):
+ data = (np.arange(-3, 3) / 4.).reshape([1, 3, 2]).astype(dtype)
+ self._compare(data, np.abs, tf.abs, use_gpu)
+ self._compare(data, np.arccos, tf.acos, use_gpu)
+ self._compare(data, np.arcsin, tf.asin, use_gpu)
+ self._compare(data, np.arctan, tf.atan, use_gpu)
+ self._compare(data, np.ceil, tf.ceil, use_gpu)
+ self._compare(data, np.cos, tf.cos, use_gpu)
+ self._compare(data, np.exp, tf.exp, use_gpu)
+ self._compare(data, np.floor, tf.floor, use_gpu)
+ self._compare(data, np.log, tf.log, use_gpu)
+ self._compare(data, np.log1p, tf.log1p, use_gpu)
+ self._compare(data, np.negative, tf.neg, use_gpu)
+ self._compare(data, self._rsqrt, tf.rsqrt, use_gpu)
+ self._compare(data, np.sin, tf.sin, use_gpu)
+ self._compare(data, np.sqrt, tf.sqrt, use_gpu)
+ self._compare(data, np.square, tf.square, use_gpu)
+ self._compare(data, np.tan, tf.tan, use_gpu)
+ self._compare(data, np.tanh, tf.tanh, use_gpu)
+
+ def testTypes(self):
+ for dtype in [np.float32]:
+ self._testDtype(dtype, use_gpu=True)
+
+ def testFloorDivide(self):
+ x = (1 + np.linspace(0, 5, np.prod([1, 3, 2]))).astype(np.float32).reshape([1, 3, 2])
+ y = (1 + np.linspace(0, 5, np.prod([1, 3, 2]))).astype(np.float32).reshape([1, 3, 2])
+
+ np_out = np.floor_divide(x, y + 0.1)
+
+ with self.test_session(use_gpu=True) as sess:
+ inx = tf.convert_to_tensor(x)
+ iny = tf.convert_to_tensor(y + 0.1)
+ ofunc = inx / iny
+ out_func2 = tf.floor(ofunc)
+ tf_out = sess.run(out_func2)
+
+ self.assertAllClose(np_out, tf_out)
+
+class BroadcastSimpleTest(tf.test.TestCase):
+ def _GetGradientArgs(self, xs, ys):
+ with self.test_session(use_gpu=True) as sess:
+ return sess.run(_broadcast_gradient_args(xs, ys))
+
+ def testBroadcast(self):
+ r0, r1 = self._GetGradientArgs([2, 3, 5], [1])
+ self.assertAllEqual(r0, [])
+ self.assertAllEqual(r1, [0, 1, 2])
+
+ _GRAD_TOL = {tf.float32: 1e-3}
+
+ def _compareGradientX(self, x, y, np_func, tf_func,
+ numeric_gradient_type=None):
+ z = np_func(x, y)
+ zs = list(z.shape)
+ with self.test_session():
+ inx = tf.convert_to_tensor(x)
+ iny = tf.convert_to_tensor(y)
+ if x.dtype in (np.float32, np.float64):
+ out = 1.1 * tf_func(inx, iny)
+ else:
+ out = tf_func(inx, iny)
+ xs = list(x.shape)
+ jacob_t, jacob_n = tf.test.compute_gradient(inx,
+ xs,
+ out,
+ zs,
+ x_init_value=x)
+ tol = self._GRAD_TOL[tf.as_dtype(x.dtype)]
+ self.assertAllClose(jacob_t, jacob_n, rtol=tol, atol=tol)
+
+ def _compareGradientY(self, x, y, np_func, tf_func,
+ numeric_gradient_type=None):
+ z = np_func(x, y)
+ zs = list(z.shape)
+ with self.test_session():
+ inx = tf.convert_to_tensor(x)
+ iny = tf.convert_to_tensor(y)
+ if x.dtype in (np.float32, np.float64):
+ out = 1.1 * tf_func(inx, iny)
+ else:
+ out = tf_func(inx, iny)
+ ys = list(np.shape(y))
+ jacob_t, jacob_n = tf.test.compute_gradient(iny,
+ ys,
+ out,
+ zs,
+ x_init_value=y)
+ tol = self._GRAD_TOL[tf.as_dtype(x.dtype)]
+ self.assertAllClose(jacob_t, jacob_n, rtol=tol, atol=tol)
+
+ def _compareGpu(self, x, y, np_func, tf_func):
+ np_ans = np_func(x, y)
+ with self.test_session(use_gpu=True):
+ inx = tf.convert_to_tensor(x)
+ iny = tf.convert_to_tensor(y)
+ out = tf_func(inx, iny)
+ tf_gpu = out.eval()
+ self.assertAllClose(np_ans, tf_gpu)
+ self.assertShapeEqual(np_ans, out)
+ # TODO(zhifengc/ke): make gradient checker work on GPU.
+
+ def testGradient(self):
+ x = (1 + np.linspace(0, 5, np.prod([1, 3, 2]))).astype(np.float32).reshape([1, 3, 2])
+ y = (1 + np.linspace(0, 5, np.prod([1, 3, 2]))).astype(np.float32).reshape([1, 3, 2])
- #def _GetGradientArgs(self, xs, ys):
- #with self.test_session(use_gpu=True) as sess:
- # return sess.run(_broadcast_gradient_args(xs, ys))
+ self._compareGradientX(x , y, np.true_divide, tf.truediv)
+ self._compareGradientY(x, y, np.true_divide, tf.truediv)
+ self._compareGpu(x, y, np.true_divide, tf.truediv)
+ self._compareGpu(x, y +0.1 , np.floor_divide, tf.floordiv)
- #def testBroadcast(self):
- #r0, r1 = self._GetGradientArgs([2, 3, 5], [1])
- #self.assertAllEqual(r0, [])
- #self.assertAllEqual(r1, [0, 1, 2])
-
if __name__ == "__main__":
tf.test.main()
diff --git a/tensorflow/python/kernel_tests/conv_ops_3d_test.py b/tensorflow/python/kernel_tests/conv_ops_3d_test.py
index f9b8255204..bd6d34f6a1 100644
--- a/tensorflow/python/kernel_tests/conv_ops_3d_test.py
+++ b/tensorflow/python/kernel_tests/conv_ops_3d_test.py
@@ -425,7 +425,7 @@ class Conv3DTest(tf.test.TestCase):
padding="SAME",
test_input=True)
- def testFilterGradientSamePaddingDifferentStrides(self):
+ def disabledtestFilterGradientSamePaddingDifferentStrides(self):
self.ConstructAndTestGradient(batch=1,
input_planes=5,
input_rows=8,
diff --git a/tensorflow/python/kernel_tests/cwise_ops_test.py b/tensorflow/python/kernel_tests/cwise_ops_test.py
index c32149e880..c097177849 100644
--- a/tensorflow/python/kernel_tests/cwise_ops_test.py
+++ b/tensorflow/python/kernel_tests/cwise_ops_test.py
@@ -779,8 +779,9 @@ class BinaryOpTest(tf.test.TestCase):
x = (1 + np.linspace(0, 5, np.prod(xs))).astype(dtype).reshape(xs)
y = (1 + np.linspace(0, 5, np.prod(ys))).astype(dtype).reshape(ys)
self._compareCpu(x, y, np_func, tf_func)
- if x.dtype in (np.float16, np.float32, np.float64, np.complex64,
- np.complex128):
+ if x.dtype in (np.float16, np.float32, np.float64):
+ # TODO(aselle): Make the test work for dtypes:
+ # (np.complex64, np.complex128).
if tf_func not in (_FLOORDIV, tf.floordiv):
if x.dtype == np.float16:
# Compare fp16 theoretical gradients to fp32 numerical gradients,
diff --git a/tensorflow/python/kernel_tests/decode_image_op_test.py b/tensorflow/python/kernel_tests/decode_image_op_test.py
new file mode 100644
index 0000000000..51cdedcf0f
--- /dev/null
+++ b/tensorflow/python/kernel_tests/decode_image_op_test.py
@@ -0,0 +1,105 @@
+# Copyright 2016 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Tests for decode_image."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import os.path
+import numpy as np
+import tensorflow as tf
+
+# Double-quote usage here is intentional to make internal path rewriting easier.
+prefix_path = os.path.join("tensorflow", "core", "lib")
+
+class DecodeImageOpTest(tf.test.TestCase):
+
+ def testGif(self):
+ # Read some real GIFs
+ path = os.path.join(prefix_path, 'gif', 'testdata', 'scan.gif')
+ WIDTH = 20
+ HEIGHT = 40
+ STRIDE = 5
+ shape = (12, HEIGHT, WIDTH, 3)
+
+ with self.test_session(use_gpu=True) as sess:
+ gif0 = tf.read_file(path)
+ image0 = tf.image.decode_image(gif0)
+ image1 = tf.image.decode_gif(gif0)
+ gif0, image0, image1 = sess.run([gif0, image0, image1])
+
+ self.assertEqual(image0.shape, shape)
+ self.assertAllEqual(image0, image1)
+
+ for frame_idx, frame in enumerate(image0):
+ gt = np.zeros(shape[1:], dtype=np.uint8)
+ start = frame_idx * STRIDE
+ end = (frame_idx + 1) * STRIDE
+ if end <= WIDTH:
+ gt[:, start:end, :] = 255
+ else:
+ start -= WIDTH
+ end -= WIDTH
+ gt[start:end, :, :] = 255
+
+ self.assertAllClose(frame, gt)
+
+ bad_channels = tf.image.decode_image(gif0, channels=1)
+ with self.assertRaises(tf.errors.InvalidArgumentError):
+ bad_channels.eval()
+
+
+ def testJpeg(self):
+ # Read a real jpeg and verify shape
+ path = os.path.join(prefix_path, 'jpeg', 'testdata', 'jpeg_merge_test1.jpg')
+ with self.test_session(use_gpu=True) as sess:
+ jpeg0 = tf.read_file(path)
+ image0 = tf.image.decode_image(jpeg0)
+ image1 = tf.image.decode_jpeg(jpeg0)
+ jpeg0, image0, image1 = sess.run([jpeg0, image0, image1])
+ self.assertEqual(len(jpeg0), 3771)
+ self.assertEqual(image0.shape, (256, 128, 3))
+ self.assertAllEqual(image0, image1)
+
+ def testPng(self):
+ # Read some real PNGs, converting to different channel numbers
+ inputs = [(1, 'lena_gray.png')]
+ for channels_in, filename in inputs:
+ for channels in 0, 1, 3:
+ with self.test_session(use_gpu=True) as sess:
+ path = os.path.join(prefix_path, 'png', 'testdata', filename)
+ png0 = tf.read_file(path)
+ image0 = tf.image.decode_image(png0, channels=channels)
+ image1 = tf.image.decode_png(png0, channels=channels)
+ png0, image0, image1 = sess.run([png0, image0, image1])
+ self.assertEqual(image0.shape, (26, 51, channels or channels_in))
+ self.assertAllEqual(image0, image1)
+
+ def testInvalidBytes(self):
+ image_bytes = b'ThisIsNotAnImage!'
+ decode = tf.image.decode_image(image_bytes)
+ with self.test_session():
+ with self.assertRaises(tf.errors.InvalidArgumentError):
+ decode.eval()
+
+ def testInvalidChannels(self):
+ image_bytes = b'unused'
+ with self.assertRaises(ValueError):
+ decode = tf.image.decode_image(image_bytes, channels=4)
+
+
+if __name__ == "__main__":
+ tf.test.main()
diff --git a/tensorflow/python/kernel_tests/io_ops_test.py b/tensorflow/python/kernel_tests/io_ops_test.py
index d484a609fc..b0c46ea07d 100644
--- a/tensorflow/python/kernel_tests/io_ops_test.py
+++ b/tensorflow/python/kernel_tests/io_ops_test.py
@@ -1,4 +1,4 @@
-# -*- coding: utf-8 -*-
+# -*- coding: utf-8 -*-
# Copyright 2015 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
@@ -20,6 +20,7 @@ from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
+import os
import tempfile
import tensorflow as tf
@@ -31,25 +32,31 @@ class IoOpsTest(tf.test.TestCase):
cases = ['', 'Some contents', 'Неки садржаји на српском']
for contents in cases:
contents = tf.compat.as_bytes(contents)
- temp = tempfile.NamedTemporaryFile(
- prefix='ReadFileTest', dir=self.get_temp_dir())
- open(temp.name, 'wb').write(contents)
+ with tempfile.NamedTemporaryFile(prefix='ReadFileTest',
+ dir=self.get_temp_dir(),
+ delete=False) as temp:
+ temp.write(contents)
with self.test_session():
read = tf.read_file(temp.name)
self.assertEqual([], read.get_shape())
self.assertEqual(read.eval(), contents)
+ os.remove(temp.name)
def testWriteFile(self):
cases = ['', 'Some contents']
for contents in cases:
contents = tf.compat.as_bytes(contents)
- temp = tempfile.NamedTemporaryFile(
- prefix='WriteFileTest', dir=self.get_temp_dir())
+ with tempfile.NamedTemporaryFile(prefix='WriteFileTest',
+ dir=self.get_temp_dir(),
+ delete=False) as temp:
+ pass
with self.test_session() as sess:
w = tf.write_file(temp.name, contents)
sess.run(w)
- file_contents = open(temp.name, 'rb').read()
+ with open(temp.name, 'rb') as f:
+ file_contents = f.read()
self.assertEqual(file_contents, contents)
+ os.remove(temp.name)
def _subset(self, files, indices):
return set(tf.compat.as_bytes(files[i].name)
@@ -59,7 +66,7 @@ class IoOpsTest(tf.test.TestCase):
cases = ['ABcDEF.GH', 'ABzDEF.GH', 'ABasdfjklDEF.GH', 'AB3DEF.GH',
'AB4DEF.GH', 'ABDEF.GH', 'XYZ']
files = [tempfile.NamedTemporaryFile(
- prefix=c, dir=self.get_temp_dir()) for c in cases]
+ prefix=c, dir=self.get_temp_dir(), delete=True) for c in cases]
with self.test_session():
# Test exact match without wildcards.
@@ -77,10 +84,16 @@ class IoOpsTest(tf.test.TestCase):
self._subset(files, [0, 1, 3, 4]))
self.assertEqual(set(tf.matching_files(pattern % '*').eval()),
self._subset(files, [0, 1, 2, 3, 4, 5]))
- self.assertEqual(set(tf.matching_files(pattern % '[cxz]').eval()),
- self._subset(files, [0, 1]))
- self.assertEqual(set(tf.matching_files(pattern % '[0-9]').eval()),
- self._subset(files, [3, 4]))
+ # NOTE(mrry): Windows uses PathMatchSpec to match file patterns, which
+ # does not support the following expressions.
+ if os.name != 'nt':
+ self.assertEqual(set(tf.matching_files(pattern % '[cxz]').eval()),
+ self._subset(files, [0, 1]))
+ self.assertEqual(set(tf.matching_files(pattern % '[0-9]').eval()),
+ self._subset(files, [3, 4]))
+
+ for f in files:
+ f.close()
if __name__ == '__main__':
diff --git a/tensorflow/python/kernel_tests/seq2seq_test.py b/tensorflow/python/kernel_tests/seq2seq_test.py
new file mode 100644
index 0000000000..03b5f68659
--- /dev/null
+++ b/tensorflow/python/kernel_tests/seq2seq_test.py
@@ -0,0 +1,770 @@
+# Copyright 2015 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+
+"""Tests for functional style sequence-to-sequence models."""
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import math
+import random
+
+import numpy as np
+import tensorflow as tf
+
+
+class Seq2SeqTest(tf.test.TestCase):
+
+ def testRNNDecoder(self):
+ with self.test_session() as sess:
+ with tf.variable_scope("root", initializer=tf.constant_initializer(0.5)):
+ inp = [tf.constant(0.5, shape=[2, 2])] * 2
+ _, enc_state = tf.nn.rnn(
+ tf.nn.rnn_cell.GRUCell(2), inp, dtype=tf.float32)
+ dec_inp = [tf.constant(0.4, shape=[2, 2])] * 3
+ cell = tf.nn.rnn_cell.OutputProjectionWrapper(
+ tf.nn.rnn_cell.GRUCell(2), 4)
+ dec, mem = tf.nn.seq2seq.rnn_decoder(dec_inp, enc_state, cell)
+ sess.run([tf.global_variables_initializer()])
+ res = sess.run(dec)
+ self.assertEqual(3, len(res))
+ self.assertEqual((2, 4), res[0].shape)
+
+ res = sess.run([mem])
+ self.assertEqual((2, 2), res[0].shape)
+
+ def testBasicRNNSeq2Seq(self):
+ with self.test_session() as sess:
+ with tf.variable_scope("root", initializer=tf.constant_initializer(0.5)):
+ inp = [tf.constant(0.5, shape=[2, 2])] * 2
+ dec_inp = [tf.constant(0.4, shape=[2, 2])] * 3
+ cell = tf.nn.rnn_cell.OutputProjectionWrapper(
+ tf.nn.rnn_cell.GRUCell(2), 4)
+ dec, mem = tf.nn.seq2seq.basic_rnn_seq2seq(inp, dec_inp, cell)
+ sess.run([tf.global_variables_initializer()])
+ res = sess.run(dec)
+ self.assertEqual(3, len(res))
+ self.assertEqual((2, 4), res[0].shape)
+
+ res = sess.run([mem])
+ self.assertEqual((2, 2), res[0].shape)
+
+ def testTiedRNNSeq2Seq(self):
+ with self.test_session() as sess:
+ with tf.variable_scope("root", initializer=tf.constant_initializer(0.5)):
+ inp = [tf.constant(0.5, shape=[2, 2])] * 2
+ dec_inp = [tf.constant(0.4, shape=[2, 2])] * 3
+ cell = tf.nn.rnn_cell.OutputProjectionWrapper(
+ tf.nn.rnn_cell.GRUCell(2), 4)
+ dec, mem = tf.nn.seq2seq.tied_rnn_seq2seq(inp, dec_inp, cell)
+ sess.run([tf.global_variables_initializer()])
+ res = sess.run(dec)
+ self.assertEqual(3, len(res))
+ self.assertEqual((2, 4), res[0].shape)
+
+ res = sess.run([mem])
+ self.assertEqual(1, len(res))
+ self.assertEqual((2, 2), res[0].shape)
+
+ def testEmbeddingRNNDecoder(self):
+ with self.test_session() as sess:
+ with tf.variable_scope("root", initializer=tf.constant_initializer(0.5)):
+ inp = [tf.constant(0.5, shape=[2, 2])] * 2
+ cell = tf.nn.rnn_cell.BasicLSTMCell(2, state_is_tuple=True)
+ _, enc_state = tf.nn.rnn(cell, inp, dtype=tf.float32)
+ dec_inp = [tf.constant(i, tf.int32, shape=[2]) for i in range(3)]
+ dec, mem = tf.nn.seq2seq.embedding_rnn_decoder(
+ dec_inp, enc_state, cell, num_symbols=4, embedding_size=2)
+ sess.run([tf.global_variables_initializer()])
+ res = sess.run(dec)
+ self.assertEqual(3, len(res))
+ self.assertEqual((2, 2), res[0].shape)
+
+ res = sess.run([mem])
+ self.assertEqual(1, len(res))
+ self.assertEqual((2, 2), res[0].c.shape)
+ self.assertEqual((2, 2), res[0].h.shape)
+
+ def testEmbeddingRNNSeq2Seq(self):
+ with self.test_session() as sess:
+ with tf.variable_scope("root", initializer=tf.constant_initializer(0.5)):
+ enc_inp = [tf.constant(1, tf.int32, shape=[2]) for i in range(2)]
+ dec_inp = [tf.constant(i, tf.int32, shape=[2]) for i in range(3)]
+ cell = tf.nn.rnn_cell.BasicLSTMCell(2, state_is_tuple=True)
+ dec, mem = tf.nn.seq2seq.embedding_rnn_seq2seq(
+ enc_inp, dec_inp, cell, num_encoder_symbols=2,
+ num_decoder_symbols=5, embedding_size=2)
+ sess.run([tf.global_variables_initializer()])
+ res = sess.run(dec)
+ self.assertEqual(3, len(res))
+ self.assertEqual((2, 5), res[0].shape)
+
+ res = sess.run([mem])
+ self.assertEqual((2, 2), res[0].c.shape)
+ self.assertEqual((2, 2), res[0].h.shape)
+
+ # Test with state_is_tuple=False.
+ with tf.variable_scope("no_tuple"):
+ cell1 = tf.nn.rnn_cell.BasicLSTMCell(2, state_is_tuple=False)
+ dec, mem = tf.nn.seq2seq.embedding_rnn_seq2seq(
+ enc_inp, dec_inp, cell1, num_encoder_symbols=2,
+ num_decoder_symbols=5, embedding_size=2)
+ sess.run([tf.global_variables_initializer()])
+ res = sess.run(dec)
+ self.assertEqual(3, len(res))
+ self.assertEqual((2, 5), res[0].shape)
+
+ res = sess.run([mem])
+ self.assertEqual((2, 4), res[0].shape)
+
+ # Test externally provided output projection.
+ w = tf.get_variable("proj_w", [2, 5])
+ b = tf.get_variable("proj_b", [5])
+ with tf.variable_scope("proj_seq2seq"):
+ dec, _ = tf.nn.seq2seq.embedding_rnn_seq2seq(
+ enc_inp, dec_inp, cell, num_encoder_symbols=2,
+ num_decoder_symbols=5, embedding_size=2, output_projection=(w, b))
+ sess.run([tf.global_variables_initializer()])
+ res = sess.run(dec)
+ self.assertEqual(3, len(res))
+ self.assertEqual((2, 2), res[0].shape)
+
+ # Test that previous-feeding model ignores inputs after the first.
+ dec_inp2 = [tf.constant(0, tf.int32, shape=[2]) for _ in range(3)]
+ with tf.variable_scope("other"):
+ d3, _ = tf.nn.seq2seq.embedding_rnn_seq2seq(
+ enc_inp, dec_inp2, cell, num_encoder_symbols=2,
+ num_decoder_symbols=5, embedding_size=2,
+ feed_previous=tf.constant(True))
+ sess.run([tf.global_variables_initializer()])
+ tf.get_variable_scope().reuse_variables()
+ d1, _ = tf.nn.seq2seq.embedding_rnn_seq2seq(
+ enc_inp, dec_inp, cell, num_encoder_symbols=2,
+ num_decoder_symbols=5, embedding_size=2, feed_previous=True)
+ d2, _ = tf.nn.seq2seq.embedding_rnn_seq2seq(
+ enc_inp, dec_inp2, cell, num_encoder_symbols=2,
+ num_decoder_symbols=5, embedding_size=2, feed_previous=True)
+ res1 = sess.run(d1)
+ res2 = sess.run(d2)
+ res3 = sess.run(d3)
+ self.assertAllClose(res1, res2)
+ self.assertAllClose(res1, res3)
+
+ def testEmbeddingTiedRNNSeq2Seq(self):
+ with self.test_session() as sess:
+ with tf.variable_scope("root", initializer=tf.constant_initializer(0.5)):
+ enc_inp = [tf.constant(1, tf.int32, shape=[2]) for i in range(2)]
+ dec_inp = [tf.constant(i, tf.int32, shape=[2]) for i in range(3)]
+ cell = tf.nn.rnn_cell.BasicLSTMCell(2, state_is_tuple=True)
+ dec, mem = tf.nn.seq2seq.embedding_tied_rnn_seq2seq(
+ enc_inp, dec_inp, cell, num_symbols=5, embedding_size=2)
+ sess.run([tf.global_variables_initializer()])
+ res = sess.run(dec)
+ self.assertEqual(3, len(res))
+ self.assertEqual((2, 5), res[0].shape)
+
+ res = sess.run([mem])
+ self.assertEqual((2, 2), res[0].c.shape)
+ self.assertEqual((2, 2), res[0].h.shape)
+
+ # Test when num_decoder_symbols is provided, the size of decoder output
+ # is num_decoder_symbols.
+ with tf.variable_scope("decoder_symbols_seq2seq"):
+ dec, mem = tf.nn.seq2seq.embedding_tied_rnn_seq2seq(
+ enc_inp, dec_inp, cell, num_symbols=5, num_decoder_symbols=3,
+ embedding_size=2)
+ sess.run([tf.global_variables_initializer()])
+ res = sess.run(dec)
+ self.assertEqual(3, len(res))
+ self.assertEqual((2, 3), res[0].shape)
+
+ # Test externally provided output projection.
+ w = tf.get_variable("proj_w", [2, 5])
+ b = tf.get_variable("proj_b", [5])
+ with tf.variable_scope("proj_seq2seq"):
+ dec, _ = tf.nn.seq2seq.embedding_tied_rnn_seq2seq(
+ enc_inp, dec_inp, cell, num_symbols=5, embedding_size=2,
+ output_projection=(w, b))
+ sess.run([tf.global_variables_initializer()])
+ res = sess.run(dec)
+ self.assertEqual(3, len(res))
+ self.assertEqual((2, 2), res[0].shape)
+
+ # Test that previous-feeding model ignores inputs after the first.
+ dec_inp2 = [tf.constant(0, tf.int32, shape=[2])] * 3
+ with tf.variable_scope("other"):
+ d3, _ = tf.nn.seq2seq.embedding_tied_rnn_seq2seq(
+ enc_inp, dec_inp2, cell, num_symbols=5, embedding_size=2,
+ feed_previous=tf.constant(True))
+ sess.run([tf.global_variables_initializer()])
+ tf.get_variable_scope().reuse_variables()
+ d1, _ = tf.nn.seq2seq.embedding_tied_rnn_seq2seq(
+ enc_inp, dec_inp, cell, num_symbols=5, embedding_size=2,
+ feed_previous=True)
+ d2, _ = tf.nn.seq2seq.embedding_tied_rnn_seq2seq(
+ enc_inp, dec_inp2, cell, num_symbols=5, embedding_size=2,
+ feed_previous=True)
+ res1 = sess.run(d1)
+ res2 = sess.run(d2)
+ res3 = sess.run(d3)
+ self.assertAllClose(res1, res2)
+ self.assertAllClose(res1, res3)
+
+ def testAttentionDecoder1(self):
+ with self.test_session() as sess:
+ with tf.variable_scope("root", initializer=tf.constant_initializer(0.5)):
+ cell = tf.nn.rnn_cell.GRUCell(2)
+ inp = [tf.constant(0.5, shape=[2, 2])] * 2
+ enc_outputs, enc_state = tf.nn.rnn(cell, inp, dtype=tf.float32)
+ attn_states = tf.concat(1, [tf.reshape(e, [-1, 1, cell.output_size])
+ for e in enc_outputs])
+ dec_inp = [tf.constant(0.4, shape=[2, 2])] * 3
+ dec, mem = tf.nn.seq2seq.attention_decoder(
+ dec_inp, enc_state,
+ attn_states, cell, output_size=4)
+ sess.run([tf.global_variables_initializer()])
+ res = sess.run(dec)
+ self.assertEqual(3, len(res))
+ self.assertEqual((2, 4), res[0].shape)
+
+ res = sess.run([mem])
+ self.assertEqual((2, 2), res[0].shape)
+
+ def testAttentionDecoder2(self):
+ with self.test_session() as sess:
+ with tf.variable_scope("root", initializer=tf.constant_initializer(0.5)):
+ cell = tf.nn.rnn_cell.GRUCell(2)
+ inp = [tf.constant(0.5, shape=[2, 2])] * 2
+ enc_outputs, enc_state = tf.nn.rnn(cell, inp, dtype=tf.float32)
+ attn_states = tf.concat(1, [tf.reshape(e, [-1, 1, cell.output_size])
+ for e in enc_outputs])
+ dec_inp = [tf.constant(0.4, shape=[2, 2])] * 3
+ dec, mem = tf.nn.seq2seq.attention_decoder(
+ dec_inp, enc_state,
+ attn_states, cell, output_size=4,
+ num_heads=2)
+ sess.run([tf.global_variables_initializer()])
+ res = sess.run(dec)
+ self.assertEqual(3, len(res))
+ self.assertEqual((2, 4), res[0].shape)
+
+ res = sess.run([mem])
+ self.assertEqual((2, 2), res[0].shape)
+
+ def testDynamicAttentionDecoder1(self):
+ with self.test_session() as sess:
+ with tf.variable_scope("root", initializer=tf.constant_initializer(0.5)):
+ cell = tf.nn.rnn_cell.GRUCell(2)
+ inp = tf.constant(0.5, shape=[2, 2, 2])
+ enc_outputs, enc_state = tf.nn.dynamic_rnn(cell, inp, dtype=tf.float32)
+ attn_states = enc_outputs
+ dec_inp = [tf.constant(0.4, shape=[2, 2])] * 3
+ dec, mem = tf.nn.seq2seq.attention_decoder(
+ dec_inp, enc_state,
+ attn_states, cell, output_size=4)
+ sess.run([tf.global_variables_initializer()])
+ res = sess.run(dec)
+ self.assertEqual(3, len(res))
+ self.assertEqual((2, 4), res[0].shape)
+
+ res = sess.run([mem])
+ self.assertEqual((2, 2), res[0].shape)
+
+ def testDynamicAttentionDecoder2(self):
+ with self.test_session() as sess:
+ with tf.variable_scope("root", initializer=tf.constant_initializer(0.5)):
+ cell = tf.nn.rnn_cell.GRUCell(2)
+ inp = tf.constant(0.5, shape=[2, 2, 2])
+ enc_outputs, enc_state = tf.nn.dynamic_rnn(cell, inp, dtype=tf.float32)
+ attn_states = enc_outputs
+ dec_inp = [tf.constant(0.4, shape=[2, 2])] * 3
+ dec, mem = tf.nn.seq2seq.attention_decoder(
+ dec_inp, enc_state,
+ attn_states, cell, output_size=4,
+ num_heads=2)
+ sess.run([tf.global_variables_initializer()])
+ res = sess.run(dec)
+ self.assertEqual(3, len(res))
+ self.assertEqual((2, 4), res[0].shape)
+
+ res = sess.run([mem])
+ self.assertEqual((2, 2), res[0].shape)
+
+ def testAttentionDecoderStateIsTuple(self):
+ with self.test_session() as sess:
+ with tf.variable_scope("root", initializer=tf.constant_initializer(0.5)):
+ cell = tf.nn.rnn_cell.BasicLSTMCell(2, state_is_tuple=True)
+ cell = tf.nn.rnn_cell.MultiRNNCell(cells=[cell] * 2,
+ state_is_tuple=True)
+ inp = [tf.constant(0.5, shape=[2, 2])] * 2
+ enc_outputs, enc_state = tf.nn.rnn(cell, inp, dtype=tf.float32)
+ attn_states = tf.concat(1, [tf.reshape(e, [-1, 1, cell.output_size])
+ for e in enc_outputs])
+ dec_inp = [tf.constant(0.4, shape=[2, 2])] * 3
+ dec, mem = tf.nn.seq2seq.attention_decoder(
+ dec_inp, enc_state,
+ attn_states, cell, output_size=4)
+ sess.run([tf.global_variables_initializer()])
+ res = sess.run(dec)
+ self.assertEqual(3, len(res))
+ self.assertEqual((2, 4), res[0].shape)
+
+ res = sess.run([mem])
+ self.assertEqual(2, len(res[0]))
+ self.assertEqual((2, 2), res[0][0].c.shape)
+ self.assertEqual((2, 2), res[0][0].h.shape)
+ self.assertEqual((2, 2), res[0][1].c.shape)
+ self.assertEqual((2, 2), res[0][1].h.shape)
+
+ # pylint: disable=unused-variable,invalid-name
+ def testDynamicAttentionDecoderStateIsTuple(self):
+ with self.test_session() as sess:
+ with tf.variable_scope(
+ "root", initializer=tf.constant_initializer(0.5)):
+ cell = tf.nn.rnn_cell.BasicLSTMCell(2, state_is_tuple=True)
+ cell = tf.nn.rnn_cell.MultiRNNCell(cells=[cell] * 2,
+ state_is_tuple=True)
+ inp = tf.constant(0.5, shape=[2, 2, 2])
+ enc_outputs, enc_state = tf.nn.rnn(cell, inp, dtype=tf.float32)
+ attn_states = tf.concat(1, [tf.reshape(e, [-1, 1, cell.output_size])
+ for e in enc_outputs])
+ dec_inp = [tf.constant(0.4, shape=[2, 2])] * 3
+ dec, mem = tf.nn.seq2seq.attention_decoder(
+ dec_inp, enc_state,
+ attn_states, cell, output_size=4)
+ sess.run([tf.global_variables_initializer()])
+ res = sess.run(dec)
+ self.assertEqual(3, len(res))
+ self.assertEqual((2, 4), res[0].shape)
+
+ res = sess.run([mem])
+ self.assertEqual(2, len(res[0]))
+ self.assertEqual((2, 2), res[0][0].c.shape)
+ self.assertEqual((2, 2), res[0][0].h.shape)
+ self.assertEqual((2, 2), res[0][1].c.shape)
+ self.assertEqual((2, 2), res[0][1].h.shape)
+
+ def testEmbeddingAttentionDecoder(self):
+ with self.test_session() as sess:
+ with tf.variable_scope("root", initializer=tf.constant_initializer(0.5)):
+ inp = [tf.constant(0.5, shape=[2, 2])] * 2
+ cell = tf.nn.rnn_cell.GRUCell(2)
+ enc_outputs, enc_state = tf.nn.rnn(cell, inp, dtype=tf.float32)
+ attn_states = tf.concat(1, [tf.reshape(e, [-1, 1, cell.output_size])
+ for e in enc_outputs])
+ dec_inp = [tf.constant(i, tf.int32, shape=[2]) for i in range(3)]
+ dec, mem = tf.nn.seq2seq.embedding_attention_decoder(
+ dec_inp, enc_state, attn_states, cell, num_symbols=4,
+ embedding_size=2, output_size=3)
+ sess.run([tf.global_variables_initializer()])
+ res = sess.run(dec)
+ self.assertEqual(3, len(res))
+ self.assertEqual((2, 3), res[0].shape)
+
+ res = sess.run([mem])
+ self.assertEqual((2, 2), res[0].shape)
+
+ def testEmbeddingAttentionSeq2Seq(self):
+ with self.test_session() as sess:
+ with tf.variable_scope("root", initializer=tf.constant_initializer(0.5)):
+ enc_inp = [tf.constant(1, tf.int32, shape=[2]) for i in range(2)]
+ dec_inp = [tf.constant(i, tf.int32, shape=[2]) for i in range(3)]
+ cell = tf.nn.rnn_cell.BasicLSTMCell(2, state_is_tuple=True)
+ dec, mem = tf.nn.seq2seq.embedding_attention_seq2seq(
+ enc_inp, dec_inp, cell, num_encoder_symbols=2,
+ num_decoder_symbols=5, embedding_size=2)
+ sess.run([tf.global_variables_initializer()])
+ res = sess.run(dec)
+ self.assertEqual(3, len(res))
+ self.assertEqual((2, 5), res[0].shape)
+
+ res = sess.run([mem])
+ self.assertEqual((2, 2), res[0].c.shape)
+ self.assertEqual((2, 2), res[0].h.shape)
+
+ # Test with state_is_tuple=False.
+ with tf.variable_scope("no_tuple"):
+ cell = tf.nn.rnn_cell.BasicLSTMCell(2, state_is_tuple=False)
+ dec, mem = tf.nn.seq2seq.embedding_attention_seq2seq(
+ enc_inp, dec_inp, cell, num_encoder_symbols=2,
+ num_decoder_symbols=5, embedding_size=2)
+ sess.run([tf.global_variables_initializer()])
+ res = sess.run(dec)
+ self.assertEqual(3, len(res))
+ self.assertEqual((2, 5), res[0].shape)
+
+ res = sess.run([mem])
+ self.assertEqual((2, 4), res[0].shape)
+
+ # Test externally provided output projection.
+ w = tf.get_variable("proj_w", [2, 5])
+ b = tf.get_variable("proj_b", [5])
+ with tf.variable_scope("proj_seq2seq"):
+ dec, _ = tf.nn.seq2seq.embedding_attention_seq2seq(
+ enc_inp, dec_inp, cell, num_encoder_symbols=2,
+ num_decoder_symbols=5, embedding_size=2, output_projection=(w, b))
+ sess.run([tf.global_variables_initializer()])
+ res = sess.run(dec)
+ self.assertEqual(3, len(res))
+ self.assertEqual((2, 2), res[0].shape)
+
+ # Test that previous-feeding model ignores inputs after the first.
+ dec_inp2 = [tf.constant(0, tf.int32, shape=[2]) for _ in range(3)]
+ with tf.variable_scope("other"):
+ d3, _ = tf.nn.seq2seq.embedding_attention_seq2seq(
+ enc_inp, dec_inp2, cell, num_encoder_symbols=2,
+ num_decoder_symbols=5, embedding_size=2,
+ feed_previous=tf.constant(True))
+ sess.run([tf.global_variables_initializer()])
+ tf.get_variable_scope().reuse_variables()
+ d1, _ = tf.nn.seq2seq.embedding_attention_seq2seq(
+ enc_inp, dec_inp, cell, num_encoder_symbols=2,
+ num_decoder_symbols=5, embedding_size=2, feed_previous=True)
+ d2, _ = tf.nn.seq2seq.embedding_attention_seq2seq(
+ enc_inp, dec_inp2, cell, num_encoder_symbols=2,
+ num_decoder_symbols=5, embedding_size=2, feed_previous=True)
+ res1 = sess.run(d1)
+ res2 = sess.run(d2)
+ res3 = sess.run(d3)
+ self.assertAllClose(res1, res2)
+ self.assertAllClose(res1, res3)
+
+ def testOne2ManyRNNSeq2Seq(self):
+ with self.test_session() as sess:
+ with tf.variable_scope("root", initializer=tf.constant_initializer(0.5)):
+ enc_inp = [tf.constant(1, tf.int32, shape=[2]) for i in range(2)]
+ dec_inp_dict = {}
+ dec_inp_dict["0"] = [
+ tf.constant(i, tf.int32, shape=[2]) for i in range(3)]
+ dec_inp_dict["1"] = [
+ tf.constant(i, tf.int32, shape=[2]) for i in range(4)]
+ dec_symbols_dict = {"0": 5, "1": 6}
+ cell = tf.nn.rnn_cell.BasicLSTMCell(2, state_is_tuple=True)
+ outputs_dict, state_dict = tf.nn.seq2seq.one2many_rnn_seq2seq(
+ enc_inp, dec_inp_dict, cell, 2, dec_symbols_dict, embedding_size=2)
+
+ sess.run([tf.global_variables_initializer()])
+ res = sess.run(outputs_dict["0"])
+ self.assertEqual(3, len(res))
+ self.assertEqual((2, 5), res[0].shape)
+ res = sess.run(outputs_dict["1"])
+ self.assertEqual(4, len(res))
+ self.assertEqual((2, 6), res[0].shape)
+ res = sess.run([state_dict["0"]])
+ self.assertEqual((2, 2), res[0].c.shape)
+ self.assertEqual((2, 2), res[0].h.shape)
+ res = sess.run([state_dict["1"]])
+ self.assertEqual((2, 2), res[0].c.shape)
+ self.assertEqual((2, 2), res[0].h.shape)
+
+ # Test that previous-feeding model ignores inputs after the first, i.e.
+ # dec_inp_dict2 has different inputs from dec_inp_dict after the first
+ # time-step.
+ dec_inp_dict2 = {}
+ dec_inp_dict2["0"] = [
+ tf.constant(0, tf.int32, shape=[2]) for _ in range(3)]
+ dec_inp_dict2["1"] = [
+ tf.constant(0, tf.int32, shape=[2]) for _ in range(4)]
+ with tf.variable_scope("other"):
+ outputs_dict3, _ = tf.nn.seq2seq.one2many_rnn_seq2seq(
+ enc_inp, dec_inp_dict2, cell, 2, dec_symbols_dict,
+ embedding_size=2, feed_previous=tf.constant(True))
+ sess.run([tf.global_variables_initializer()])
+ tf.get_variable_scope().reuse_variables()
+ outputs_dict1, _ = tf.nn.seq2seq.one2many_rnn_seq2seq(
+ enc_inp, dec_inp_dict, cell, 2, dec_symbols_dict,
+ embedding_size=2, feed_previous=True)
+ outputs_dict2, _ = tf.nn.seq2seq.one2many_rnn_seq2seq(
+ enc_inp, dec_inp_dict2, cell, 2, dec_symbols_dict,
+ embedding_size=2, feed_previous=True)
+ res1 = sess.run(outputs_dict1["0"])
+ res2 = sess.run(outputs_dict2["0"])
+ res3 = sess.run(outputs_dict3["0"])
+ self.assertAllClose(res1, res2)
+ self.assertAllClose(res1, res3)
+
+ def testSequenceLoss(self):
+ with self.test_session() as sess:
+ logits = [tf.constant(i + 0.5, shape=[2, 5]) for i in range(3)]
+ targets = [tf.constant(i, tf.int32, shape=[2]) for i in range(3)]
+ weights = [tf.constant(1.0, shape=[2]) for i in range(3)]
+
+ average_loss_per_example = tf.nn.seq2seq.sequence_loss(
+ logits, targets, weights,
+ average_across_timesteps=True,
+ average_across_batch=True)
+ res = sess.run(average_loss_per_example)
+ self.assertAllClose(1.60944, res)
+
+ average_loss_per_sequence = tf.nn.seq2seq.sequence_loss(
+ logits, targets, weights,
+ average_across_timesteps=False,
+ average_across_batch=True)
+ res = sess.run(average_loss_per_sequence)
+ self.assertAllClose(4.828314, res)
+
+ total_loss = tf.nn.seq2seq.sequence_loss(
+ logits, targets, weights,
+ average_across_timesteps=False,
+ average_across_batch=False)
+ res = sess.run(total_loss)
+ self.assertAllClose(9.656628, res)
+
+ def testSequenceLossByExample(self):
+ with self.test_session() as sess:
+ output_classes = 5
+ logits = [tf.constant(i + 0.5, shape=[2, output_classes])
+ for i in range(3)]
+ targets = [tf.constant(i, tf.int32, shape=[2]) for i in range(3)]
+ weights = [tf.constant(1.0, shape=[2]) for i in range(3)]
+
+ average_loss_per_example = tf.nn.seq2seq.sequence_loss_by_example(
+ logits, targets, weights,
+ average_across_timesteps=True)
+ res = sess.run(average_loss_per_example)
+ self.assertAllClose(np.asarray([1.609438, 1.609438]), res)
+
+ loss_per_sequence = tf.nn.seq2seq.sequence_loss_by_example(
+ logits, targets, weights,
+ average_across_timesteps=False)
+ res = sess.run(loss_per_sequence)
+ self.assertAllClose(np.asarray([4.828314, 4.828314]), res)
+
+ def testModelWithBucketsScopeAndLoss(self):
+ """Test that variable scope reuse is not reset after model_with_buckets."""
+ classes = 10
+ buckets = [(4, 4), (8, 8)]
+
+ with self.test_session():
+ # Here comes a sample Seq2Seq model using GRU cells.
+ def SampleGRUSeq2Seq(enc_inp, dec_inp, weights, per_example_loss):
+ """Example sequence-to-sequence model that uses GRU cells."""
+ def GRUSeq2Seq(enc_inp, dec_inp):
+ cell = tf.nn.rnn_cell.MultiRNNCell([tf.nn.rnn_cell.GRUCell(24)] * 2,
+ state_is_tuple=True)
+ return tf.nn.seq2seq.embedding_attention_seq2seq(
+ enc_inp, dec_inp, cell, num_encoder_symbols=classes,
+ num_decoder_symbols=classes, embedding_size=24)
+ targets = [dec_inp[i+1] for i in range(len(dec_inp) - 1)] + [0]
+ return tf.nn.seq2seq.model_with_buckets(
+ enc_inp, dec_inp, targets, weights, buckets, GRUSeq2Seq,
+ per_example_loss=per_example_loss)
+
+ # Now we construct the copy model.
+ inp = [tf.placeholder(tf.int32, shape=[None]) for _ in range(8)]
+ out = [tf.placeholder(tf.int32, shape=[None]) for _ in range(8)]
+ weights = [tf.ones_like(inp[0], dtype=tf.float32) for _ in range(8)]
+ with tf.variable_scope("root"):
+ _, losses1 = SampleGRUSeq2Seq(inp, out, weights, per_example_loss=False)
+ # Now check that we did not accidentally set reuse.
+ self.assertEqual(False, tf.get_variable_scope().reuse)
+ # Construct one more model with per-example loss.
+ tf.get_variable_scope().reuse_variables()
+ _, losses2 = SampleGRUSeq2Seq(inp, out, weights, per_example_loss=True)
+      # First loss is scalar, the second one is a 1-dimensional tensor.
+ self.assertEqual([], losses1[0].get_shape().as_list())
+ self.assertEqual([None], losses2[0].get_shape().as_list())
+
+ def testModelWithBuckets(self):
+ """Larger tests that does full sequence-to-sequence model training."""
+ # We learn to copy 10 symbols in 2 buckets: length 4 and length 8.
+ classes = 10
+ buckets = [(4, 4), (8, 8)]
+ perplexities = [[], []] # Results for each bucket.
+ tf.set_random_seed(111)
+ random.seed(111)
+ np.random.seed(111)
+
+ with self.test_session() as sess:
+ # We use sampled softmax so we keep output projection separate.
+ w = tf.get_variable("proj_w", [24, classes])
+ w_t = tf.transpose(w)
+ b = tf.get_variable("proj_b", [classes])
+ # Here comes a sample Seq2Seq model using GRU cells.
+ def SampleGRUSeq2Seq(enc_inp, dec_inp, weights):
+ """Example sequence-to-sequence model that uses GRU cells."""
+ def GRUSeq2Seq(enc_inp, dec_inp):
+ cell = tf.nn.rnn_cell.MultiRNNCell([tf.nn.rnn_cell.GRUCell(24)] * 2,
+ state_is_tuple=True)
+ return tf.nn.seq2seq.embedding_attention_seq2seq(
+ enc_inp, dec_inp, cell, num_encoder_symbols=classes,
+ num_decoder_symbols=classes, embedding_size=24,
+ output_projection=(w, b))
+ targets = [dec_inp[i+1] for i in range(len(dec_inp) - 1)] + [0]
+ def SampledLoss(labels, inputs):
+ labels = tf.reshape(labels, [-1, 1])
+ return tf.nn.sampled_softmax_loss(w_t, b, inputs, labels, 8, classes)
+ return tf.nn.seq2seq.model_with_buckets(
+ enc_inp, dec_inp, targets, weights, buckets, GRUSeq2Seq,
+ softmax_loss_function=SampledLoss)
+
+ # Now we construct the copy model.
+ batch_size = 8
+ inp = [tf.placeholder(tf.int32, shape=[None]) for _ in range(8)]
+ out = [tf.placeholder(tf.int32, shape=[None]) for _ in range(8)]
+ weights = [tf.ones_like(inp[0], dtype=tf.float32) for _ in range(8)]
+ with tf.variable_scope("root"):
+ _, losses = SampleGRUSeq2Seq(inp, out, weights)
+ updates = []
+ params = tf.global_variables()
+ optimizer = tf.train.AdamOptimizer(0.03, epsilon=1e-5)
+ for i in range(len(buckets)):
+ full_grads = tf.gradients(losses[i], params)
+ grads, _ = tf.clip_by_global_norm(full_grads, 30.0)
+ update = optimizer.apply_gradients(zip(grads, params))
+ updates.append(update)
+ sess.run([tf.global_variables_initializer()])
+ steps = 6
+ for _ in range(steps):
+ bucket = random.choice(np.arange(len(buckets)))
+ length = buckets[bucket][0]
+ i = [np.array([np.random.randint(9) + 1 for _ in range(batch_size)],
+ dtype=np.int32) for _ in range(length)]
+ # 0 is our "GO" symbol here.
+ o = [np.array([0] * batch_size, dtype=np.int32)] + i
+ feed = {}
+ for i1, i2, o1, o2 in zip(inp[:length], i[:length],
+ out[:length], o[:length]):
+ feed[i1.name] = i2
+ feed[o1.name] = o2
+ if length < 8: # For the 4-bucket, we need the 5th as target.
+ feed[out[length].name] = o[length]
+ res = sess.run([updates[bucket], losses[bucket]], feed)
+ perplexities[bucket].append(math.exp(float(res[1])))
+ for bucket in range(len(buckets)):
+ if len(perplexities[bucket]) > 1: # Assert that perplexity went down.
+ self.assertLess(perplexities[bucket][-1], perplexities[bucket][0])
+
+ def testModelWithBooleanFeedPrevious(self):
+ """Test the model behavior when feed_previous is True.
+
+ For example, the following two cases have the same effect:
+ - Train `embedding_rnn_seq2seq` with `feed_previous=True`, which contains
+      an `embedding_rnn_decoder` with `feed_previous=True` and
+ `update_embedding_for_previous=True`. The decoder is fed with "<Go>"
+ and outputs "A, B, C".
+ - Train `embedding_rnn_seq2seq` with `feed_previous=False`. The decoder
+ is fed with "<Go>, A, B".
+ """
+ num_encoder_symbols = 3
+ num_decoder_symbols = 5
+ batch_size = 2
+ num_enc_timesteps = 2
+ num_dec_timesteps = 3
+
+ def TestModel(seq2seq):
+ with self.test_session(graph=tf.Graph()) as sess:
+ tf.set_random_seed(111)
+ random.seed(111)
+ np.random.seed(111)
+
+ enc_inp = [tf.constant(i + 1, tf.int32, shape=[batch_size])
+ for i in range(num_enc_timesteps)]
+ dec_inp_fp_true = [tf.constant(i, tf.int32, shape=[batch_size])
+ for i in range(num_dec_timesteps)]
+ dec_inp_holder_fp_false = [tf.placeholder(tf.int32, shape=[batch_size])
+ for _ in range(num_dec_timesteps)]
+ targets = [tf.constant(i + 1, tf.int32, shape=[batch_size])
+ for i in range(num_dec_timesteps)]
+ weights = [tf.constant(1.0, shape=[batch_size])
+ for i in range(num_dec_timesteps)]
+
+ def ForwardBackward(enc_inp, dec_inp, feed_previous):
+ scope_name = "fp_{}".format(feed_previous)
+ with tf.variable_scope(scope_name):
+ dec_op, _ = seq2seq(enc_inp, dec_inp, feed_previous=feed_previous)
+ net_variables = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES,
+ scope_name)
+ optimizer = tf.train.AdamOptimizer(0.03, epsilon=1e-5)
+ update_op = optimizer.minimize(
+ tf.nn.seq2seq.sequence_loss(dec_op, targets, weights),
+ var_list=net_variables)
+ return dec_op, update_op, net_variables
+
+ dec_op_fp_true, update_fp_true, variables_fp_true = ForwardBackward(
+ enc_inp, dec_inp_fp_true, feed_previous=True)
+ _, update_fp_false, variables_fp_false = ForwardBackward(
+ enc_inp, dec_inp_holder_fp_false, feed_previous=False)
+
+ sess.run(tf.global_variables_initializer())
+
+ # We only check consistencies between the variables existing in both
+ # the models with True and False feed_previous. Variables created by
+ # the loop_function in the model with True feed_previous are ignored.
+ v_false_name_dict = {v.name.split("/", 1)[-1]: v
+ for v in variables_fp_false}
+ matched_variables = [(v, v_false_name_dict[v.name.split("/", 1)[-1]])
+ for v in variables_fp_true]
+ for v_true, v_false in matched_variables:
+ sess.run(tf.assign(v_false, v_true))
+
+ # Take the symbols generated by the decoder with feed_previous=True as
+ # the true input symbols for the decoder with feed_previous=False.
+ dec_fp_true = sess.run(dec_op_fp_true)
+ output_symbols_fp_true = np.argmax(dec_fp_true, axis=2)
+ dec_inp_fp_false = np.vstack((dec_inp_fp_true[0].eval(),
+ output_symbols_fp_true[:-1]))
+ sess.run(update_fp_true)
+ sess.run(update_fp_false,
+ {holder: inp for holder, inp in zip(dec_inp_holder_fp_false,
+ dec_inp_fp_false)})
+
+ for v_true, v_false in matched_variables:
+ self.assertAllClose(v_true.eval(), v_false.eval())
+
+ def EmbeddingRNNSeq2SeqF(enc_inp, dec_inp, feed_previous):
+ cell = tf.nn.rnn_cell.BasicLSTMCell(2, state_is_tuple=True)
+ return tf.nn.seq2seq.embedding_rnn_seq2seq(
+ enc_inp, dec_inp, cell, num_encoder_symbols,
+ num_decoder_symbols, embedding_size=2, feed_previous=feed_previous)
+
+ def EmbeddingRNNSeq2SeqNoTupleF(enc_inp, dec_inp, feed_previous):
+ cell = tf.nn.rnn_cell.BasicLSTMCell(2, state_is_tuple=False)
+ return tf.nn.seq2seq.embedding_rnn_seq2seq(
+ enc_inp, dec_inp, cell, num_encoder_symbols,
+ num_decoder_symbols, embedding_size=2, feed_previous=feed_previous)
+
+ def EmbeddingTiedRNNSeq2Seq(enc_inp, dec_inp, feed_previous):
+ cell = tf.nn.rnn_cell.BasicLSTMCell(2, state_is_tuple=True)
+ return tf.nn.seq2seq.embedding_tied_rnn_seq2seq(
+ enc_inp, dec_inp, cell, num_decoder_symbols, embedding_size=2,
+ feed_previous=feed_previous)
+
+ def EmbeddingTiedRNNSeq2SeqNoTuple(enc_inp, dec_inp, feed_previous):
+ cell = tf.nn.rnn_cell.BasicLSTMCell(2, state_is_tuple=False)
+ return tf.nn.seq2seq.embedding_tied_rnn_seq2seq(
+ enc_inp, dec_inp, cell, num_decoder_symbols, embedding_size=2,
+ feed_previous=feed_previous)
+
+ def EmbeddingAttentionSeq2Seq(enc_inp, dec_inp, feed_previous):
+ cell = tf.nn.rnn_cell.BasicLSTMCell(2, state_is_tuple=True)
+ return tf.nn.seq2seq.embedding_attention_seq2seq(
+ enc_inp, dec_inp, cell, num_encoder_symbols,
+ num_decoder_symbols, embedding_size=2, feed_previous=feed_previous)
+
+ def EmbeddingAttentionSeq2SeqNoTuple(enc_inp, dec_inp, feed_previous):
+ cell = tf.nn.rnn_cell.BasicLSTMCell(2, state_is_tuple=False)
+ return tf.nn.seq2seq.embedding_attention_seq2seq(
+ enc_inp, dec_inp, cell, num_encoder_symbols,
+ num_decoder_symbols, embedding_size=2, feed_previous=feed_previous)
+
+ for model in (EmbeddingRNNSeq2SeqF, EmbeddingRNNSeq2SeqNoTupleF,
+ EmbeddingTiedRNNSeq2Seq, EmbeddingTiedRNNSeq2SeqNoTuple,
+ EmbeddingAttentionSeq2Seq, EmbeddingAttentionSeq2SeqNoTuple):
+ TestModel(model)
+
+
+if __name__ == "__main__":
+ tf.test.main()
diff --git a/tensorflow/python/ops/control_flow_ops.py b/tensorflow/python/ops/control_flow_ops.py
index 55548e7541..fe764b41ac 100644
--- a/tensorflow/python/ops/control_flow_ops.py
+++ b/tensorflow/python/ops/control_flow_ops.py
@@ -1731,7 +1731,7 @@ def cond(pred, fn1, fn2, name=None):
y = tf.constant(5)
def f1(): return tf.mul(x, 17)
def f2(): return tf.add(y, 23)
- r = cond(tf.less(x, y), f1, f2)
+ r = tf.cond(tf.less(x, y), f1, f2)
# r is set to f1().
# Operations in f2 (e.g., tf.add) are not executed.
```
diff --git a/tensorflow/python/ops/ctc_ops.py b/tensorflow/python/ops/ctc_ops.py
index 0a3cb06f49..2a05049857 100644
--- a/tensorflow/python/ops/ctc_ops.py
+++ b/tensorflow/python/ops/ctc_ops.py
@@ -213,8 +213,8 @@ def ctc_beam_search_decoder(inputs, sequence_length, beam_width=100,
"""Performs beam search decoding on the logits given in input.
**Note** The `ctc_greedy_decoder` is a special case of the
- `ctc_beam_search_decoder` with `top_paths=1` (but that decoder is faster
- for this special case).
+ `ctc_beam_search_decoder` with `top_paths=1` and `beam_width=1` (but
+ that decoder is faster for this special case).
If `merge_repeated` is `True`, merge repeated classes in the output beams.
This means that if consecutive entries in a beam are the same,
diff --git a/tensorflow/python/ops/embedding_ops.py b/tensorflow/python/ops/embedding_ops.py
index f891b94e2e..aae65b194b 100644
--- a/tensorflow/python/ops/embedding_ops.py
+++ b/tensorflow/python/ops/embedding_ops.py
@@ -63,10 +63,11 @@ def embedding_lookup(params, ids, partition_strategy="mod", name=None,
tensor. The returned tensor has shape `shape(ids) + shape(params)[1:]`.
Args:
- params: A list of tensors with the same type and which can be concatenated
- along dimension 0. Alternatively, a `PartitionedVariable`, created by
- partitioning along dimension 0. Each element must be appropriately sized
- for the given `partition_strategy`.
+ params: A single tensor representing the complete embedding tensor,
+ or a list of P tensors all of same shape except for the first dimension,
+ representing sharded embedding tensors. Alternatively, a
+ `PartitionedVariable`, created by partitioning along dimension 0. Each
+ element must be appropriately sized for the given `partition_strategy`.
ids: A `Tensor` with type `int32` or `int64` containing the ids to be looked
up in `params`.
partition_strategy: A string specifying the partitioning strategy, relevant
@@ -217,7 +218,8 @@ def embedding_lookup_sparse(params, sp_ids, sp_weights,
params: A single tensor representing the complete embedding tensor,
or a list of P tensors all of same shape except for the first dimension,
representing sharded embedding tensors. Alternatively, a
- `PartitionedVariable`, created by partitioning along dimension 0.
+ `PartitionedVariable`, created by partitioning along dimension 0. Each
+ element must be appropriately sized for the given `partition_strategy`.
sp_ids: N x M SparseTensor of int64 ids (typically from FeatureValueToId),
where N is typically batch size and M is arbitrary.
sp_weights: either a SparseTensor of float / double weights, or None to
diff --git a/tensorflow/python/ops/image_ops.py b/tensorflow/python/ops/image_ops.py
index bc39a7b771..f11db98ed3 100644
--- a/tensorflow/python/ops/image_ops.py
+++ b/tensorflow/python/ops/image_ops.py
@@ -36,6 +36,8 @@ to be stripped from the image and re-attached using slicing ops.
@@decode_png
@@encode_png
+@@decode_image
+
## Resizing
The resizing Ops accept input images as tensors of several types. They always
diff --git a/tensorflow/python/ops/image_ops_impl.py b/tensorflow/python/ops/image_ops_impl.py
index 639df5d845..15694d4b3f 100644
--- a/tensorflow/python/ops/image_ops_impl.py
+++ b/tensorflow/python/ops/image_ops_impl.py
@@ -32,6 +32,7 @@ from tensorflow.python.ops import clip_ops
from tensorflow.python.ops import control_flow_ops
from tensorflow.python.ops import gen_image_ops
from tensorflow.python.ops import gen_nn_ops
+from tensorflow.python.ops import string_ops
from tensorflow.python.ops import math_ops
from tensorflow.python.ops import random_ops
from tensorflow.python.ops import variables
@@ -1204,3 +1205,58 @@ def adjust_saturation(image, saturation_factor, name=None):
rgb_altered = gen_image_ops.hsv_to_rgb(hsv_altered)
return convert_image_dtype(rgb_altered, orig_dtype)
+
+
+def decode_image(contents, channels=None, name=None):
+ """Convenience function for `decode_gif`, `decode_jpeg`, and `decode_png`.
+ Detects whether an image is a GIF, JPEG, or PNG, and performs the appropriate
+ operation to convert the input bytes `string` into a `Tensor` of type `uint8`.
+
+ Note: `decode_gif` returns a 4-D array `[num_frames, height, width, 3]`, as
+ opposed to `decode_jpeg` and `decode_png`, which return 3-D arrays
+ `[height, width, num_channels]`. Make sure to take this into account when
+ constructing your graph if you are intermixing GIF files with JPEG and/or PNG
+ files.
+
+ Args:
+ contents: 0-D `string`. The encoded image bytes.
+ channels: An optional `int`. Defaults to `0`. Number of color channels for
+ the decoded image.
+ name: A name for the operation (optional)
+
+ Returns:
+ `Tensor` with type `uint8` with shape `[height, width, num_channels]` for
+ JPEG and PNG images and shape `[num_frames, height, width, 3]` for GIF
+ images.
+ """
+ with ops.name_scope(name, 'decode_image') as scope:
+ if channels not in (None, 0, 1, 3):
+ raise ValueError('channels must be in (None, 0, 1, 3)')
+ substr = string_ops.substr(contents, 0, 4)
+
+ def _gif():
+ # Create assert op to check that bytes are GIF decodable
+ is_gif = math_ops.equal(substr, b'\x47\x49\x46\x38', name='is_gif')
+ decode_msg = 'Unable to decode bytes as JPEG, PNG, or GIF'
+ assert_decode = control_flow_ops.Assert(is_gif, [decode_msg])
+ # Create assert to make sure that channels is not set to 1
+ # Already checked above that channels is in (None, 0, 1, 3)
+ gif_channels = 0 if channels is None else channels
+ good_channels = math_ops.not_equal(gif_channels, 1, name='check_channels')
+ channels_msg = 'Channels must be in (None, 0, 3) when decoding GIF images'
+ assert_channels = control_flow_ops.Assert(good_channels, [channels_msg])
+ with ops.control_dependencies([assert_decode, assert_channels]):
+ return gen_image_ops.decode_gif(contents)
+
+ def _png():
+ return gen_image_ops.decode_png(contents, channels)
+
+ def check_png():
+ is_png = math_ops.equal(substr, b'\211PNG', name='is_png')
+ return control_flow_ops.cond(is_png, _png, _gif, name='cond_png')
+
+ def _jpeg():
+ return gen_image_ops.decode_jpeg(contents, channels)
+
+ is_jpeg = math_ops.equal(substr, b'\xff\xd8\xff\xe0', name='is_jpeg')
+ return control_flow_ops.cond(is_jpeg, _jpeg, check_png, name='cond_jpeg')
diff --git a/tensorflow/python/ops/image_ops_test.py b/tensorflow/python/ops/image_ops_test.py
index 7275b674d0..e568cff352 100644
--- a/tensorflow/python/ops/image_ops_test.py
+++ b/tensorflow/python/ops/image_ops_test.py
@@ -371,7 +371,7 @@ class AdjustHueBenchmark(test.Benchmark):
delta = tf.constant(0.1, dtype=tf.float32)
outputs = image_ops.adjust_hue(inputs, delta)
run_op = tf.group(outputs)
- sess.run(tf.initialize_all_variables())
+ sess.run(tf.global_variables_initializer())
for i in xrange(warmup_rounds + benchmark_rounds):
if i == warmup_rounds:
start = time.time()
diff --git a/tensorflow/python/ops/logging_ops.py b/tensorflow/python/ops/logging_ops.py
index 40749b551c..4901057836 100644
--- a/tensorflow/python/ops/logging_ops.py
+++ b/tensorflow/python/ops/logging_ops.py
@@ -26,7 +26,7 @@ from tensorflow.python.ops import gen_logging_ops
# pylint: disable=wildcard-import
from tensorflow.python.ops.gen_logging_ops import *
# pylint: enable=wildcard-import
-
+from tensorflow.python.util.deprecation import deprecated
# The python wrapper for Assert is in control_flow_ops, as the Assert
# call relies on certain conditionals for its dependencies. Use
@@ -70,6 +70,11 @@ def _Collect(val, collections, default_collections):
ops.add_to_collection(key, val)
+@deprecated(
+ "2016-11-30", "Please switch to tf.summary.histogram. Note that "
+ "tf.summary.histogram uses the node name instead of the tag. "
+ "This means that TensorFlow will automatically de-duplicate summary "
+ "names based on their scope.")
def histogram_summary(tag, values, collections=None, name=None):
# pylint: disable=line-too-long
"""Outputs a `Summary` protocol buffer with a histogram.
@@ -104,6 +109,12 @@ def histogram_summary(tag, values, collections=None, name=None):
return val
+@deprecated(
+ "2016-11-30", "Please switch to tf.summary.image. Note that "
+    "tf.summary.image uses the node name instead of the tag. "
+ "This means that TensorFlow will automatically de-duplicate summary "
+ "names based on the scope they are created in. Also, the max_images "
+ "argument was renamed to max_outputs.")
def image_summary(tag, tensor, max_images=3, collections=None, name=None):
# pylint: disable=line-too-long
"""Outputs a `Summary` protocol buffer with images.
@@ -159,6 +170,11 @@ def image_summary(tag, tensor, max_images=3, collections=None, name=None):
return val
+@deprecated(
+ "2016-11-30", "Please switch to tf.summary.audio. Note that "
+    "tf.summary.audio uses the node name instead of the tag. "
+ "This means that TensorFlow will automatically de-duplicate summary "
+ "names based on the scope they are created in.")
def audio_summary(tag,
tensor,
sample_rate,
@@ -213,6 +229,7 @@ def audio_summary(tag,
return val
+@deprecated("2016-11-30", "Please switch to tf.summary.merge.")
def merge_summary(inputs, collections=None, name=None):
# pylint: disable=line-too-long
"""Merges summaries.
@@ -245,6 +262,7 @@ def merge_summary(inputs, collections=None, name=None):
return val
+@deprecated("2016-11-30", "Please switch to tf.summary.merge_all.")
def merge_all_summaries(key=ops.GraphKeys.SUMMARIES):
"""Merges all summaries collected in the default graph.
@@ -291,6 +309,13 @@ def get_summary_op():
return summary_op
+@deprecated(
+ "2016-11-30", "Please switch to tf.summary.scalar. Note that "
+ "tf.summary.scalar uses the node name instead of the tag. "
+ "This means that TensorFlow will automatically de-duplicate summary "
+ "names based on the scope they are created in. Also, passing a "
+ "tensor or list of tags to a scalar summary op is no longer "
+ "supported.")
def scalar_summary(tags, values, collections=None, name=None):
# pylint: disable=line-too-long
"""Outputs a `Summary` protocol buffer with scalar values.
diff --git a/tensorflow/python/ops/nn_impl.py b/tensorflow/python/ops/nn_impl.py
index 4710af0d9f..73aea2c260 100644
--- a/tensorflow/python/ops/nn_impl.py
+++ b/tensorflow/python/ops/nn_impl.py
@@ -514,15 +514,15 @@ def sufficient_statistics(x, axes, shift=None, keep_dims=False, name=None):
with ops.name_scope(name, "sufficient_statistics", [x, shift]):
x = ops.convert_to_tensor(x, name="x")
x_shape = x.get_shape()
- if x_shape.is_fully_defined():
+ if all(x_shape[d].value is not None for d in axes):
counts = 1
for d in axes:
counts *= x_shape[d].value
counts = constant_op.constant(counts, dtype=x.dtype)
else: # shape needs to be inferred at runtime.
- x_dims = array_ops.gather(array_ops.shape(x), axes)
- counts = math_ops.cast(
- math_ops.reduce_prod(x_dims), x.dtype, name="count")
+ x_dims = array_ops.gather(
+ math_ops.cast(array_ops.shape(x), x.dtype), axes)
+ counts = math_ops.reduce_prod(x_dims, name="count")
if shift is not None:
shift = ops.convert_to_tensor(shift, name="shift")
m_ss = math_ops.sub(x, shift)
diff --git a/tensorflow/python/ops/template.py b/tensorflow/python/ops/template.py
index 09955e690c..e600478b42 100644
--- a/tensorflow/python/ops/template.py
+++ b/tensorflow/python/ops/template.py
@@ -44,7 +44,7 @@ def make_template(name_, func_, create_scope_now_=False, unique_name_=None,
that are intended to be locals can be created by specifying
`tf.Variable(..., trainable=false)`.
* The function may use variable scopes and other templates internally to
- create and reuse variables, but it shouldn't use `tf.all_variables` to
+ create and reuse variables, but it shouldn't use `tf.global_variables` to
capture variables that are defined outside of the scope of the function.
* Internal scopes and variable names should not depend on any arguments that
are not supplied to `make_template`. In general you will get a ValueError
diff --git a/tensorflow/python/ops/variables.py b/tensorflow/python/ops/variables.py
index 34cdfda2b2..69134f8a74 100644
--- a/tensorflow/python/ops/variables.py
+++ b/tensorflow/python/ops/variables.py
@@ -83,7 +83,7 @@ class Variable(object):
```
The most common initialization pattern is to use the convenience function
- `global_variable_initializers()` to add an Op to the graph that initializes
+ `global_variables_initializer()` to add an Op to the graph that initializes
all the variables. You then run that Op after launching the graph.
```python
@@ -492,7 +492,7 @@ class Variable(object):
```python
v = tf.Variable([1, 2])
- init = tf.global_variable_initializers()
+ init = tf.global_variables_initializer()
with tf.Session() as sess:
sess.run(init)
diff --git a/tensorflow/python/platform/test.py b/tensorflow/python/platform/test.py
index 2b05aad537..1bd91dba09 100644
--- a/tensorflow/python/platform/test.py
+++ b/tensorflow/python/platform/test.py
@@ -144,6 +144,7 @@ def gpu_device_name():
return x.name()
return ''
+
_allowed_symbols = [
# We piggy-back googletest documentation.
'Benchmark',
diff --git a/tensorflow/python/training/adadelta_test.py b/tensorflow/python/training/adadelta_test.py
index 9233b21491..717310f042 100644
--- a/tensorflow/python/training/adadelta_test.py
+++ b/tensorflow/python/training/adadelta_test.py
@@ -91,18 +91,18 @@ class AdadeltaOptimizerTest(tf.test.TestCase):
for slot_idx in range(2):
self.assertAllCloseAccordingToType(
np.array([accum, accum], dtype=dtype.as_numpy_dtype()),
- slot[slot_idx].eval(), rtol=1e-2, atol=1e-2)
+ slot[slot_idx].eval(), rtol=1e-3)
self.assertAllCloseAccordingToType(
np.array([accum_update, accum_update],
dtype=dtype.as_numpy_dtype()),
- slot_update[slot_idx].eval())
+ slot_update[slot_idx].eval(), rtol=1e-3)
# Check that the parameters have been updated
self.assertAllCloseAccordingToType(
np.array([var0_init[0] - tot_update,
var0_init[1] - tot_update], dtype=dtype.as_numpy_dtype()),
- var0.eval(), rtol=1e-2)
+ var0.eval(), rtol=1e-3)
self.assertAllCloseAccordingToType(
np.array([var1_init[0] - tot_update,
diff --git a/tensorflow/python/training/device_setter.py b/tensorflow/python/training/device_setter.py
index 8a1dadccf4..7f403f4927 100644
--- a/tensorflow/python/training/device_setter.py
+++ b/tensorflow/python/training/device_setter.py
@@ -148,7 +148,7 @@ def replica_device_setter(ps_tasks=0, ps_device="/job:ps",
cluster_spec = {
"ps": ["ps0:2222", "ps1:2222"],
"worker": ["worker0:2222", "worker1:2222", "worker2:2222"]}
- with tf.device(tf.replica_device_setter(cluster=cluster_spec)):
+ with tf.device(tf.train.replica_device_setter(cluster=cluster_spec)):
# Build your graph
v1 = tf.Variable(...) # assigned to /job:ps/task:0
v2 = tf.Variable(...) # assigned to /job:ps/task:1
diff --git a/tensorflow/python/training/moving_averages.py b/tensorflow/python/training/moving_averages.py
index 32a4b175bb..ff9b0e5ae2 100644
--- a/tensorflow/python/training/moving_averages.py
+++ b/tensorflow/python/training/moving_averages.py
@@ -331,7 +331,7 @@ class ExponentialMovingAverage(object):
shadow variables are created with `trainable=False` and added to the
`GraphKeys.ALL_VARIABLES` collection. They will be returned by calls to
- `tf.all_variables()`.
+ `tf.global_variables()`.
Returns an op that updates all shadow variables as described above.
diff --git a/tensorflow/python/training/saver_test.py b/tensorflow/python/training/saver_test.py
index e685bd514f..1c20147c51 100644
--- a/tensorflow/python/training/saver_test.py
+++ b/tensorflow/python/training/saver_test.py
@@ -1207,7 +1207,7 @@ class CheckpointStateTest(tf.test.TestCase):
train_dir = "train"
os.mkdir(train_dir)
abs_path = os.path.join(save_dir, "model-0")
- rel_path = "train/model-2"
+ rel_path = os.path.join("train", "model-2")
tf.train.update_checkpoint_state(
train_dir,
rel_path,
@@ -1914,7 +1914,7 @@ class ScopedGraphTest(tf.test.TestCase):
tf.add_to_collection("logits", logits)
# The rest of the variables.
- rest_variables = list(set(tf.all_variables()) - set(var_list.keys()))
+ rest_variables = list(set(tf.global_variables()) - set(var_list.keys()))
init_rest_op = tf.initialize_variables(rest_variables)
with self.test_session(graph=graph) as sess:
diff --git a/tensorflow/python/training/session_manager_test.py b/tensorflow/python/training/session_manager_test.py
index fa81f9b83f..1d3dcda119 100644
--- a/tensorflow/python/training/session_manager_test.py
+++ b/tensorflow/python/training/session_manager_test.py
@@ -154,7 +154,7 @@ class SessionManagerTest(tf.test.TestCase):
"you must also pass a local_init_op "):
tf.train.SessionManager(
ready_for_local_init_op=tf.report_uninitialized_variables(
- tf.all_variables()),
+ tf.global_variables()),
local_init_op=None)
def testRecoverSessionWithReadyForLocalInitOp(self):
@@ -192,7 +192,7 @@ class SessionManagerTest(tf.test.TestCase):
sm2 = tf.train.SessionManager(
ready_op=tf.report_uninitialized_variables(),
ready_for_local_init_op=tf.report_uninitialized_variables(
- tf.all_variables()),
+ tf.global_variables()),
local_init_op=w.initializer)
saver = tf.train.Saver({"v": v})
sess, initialized = sm2.recover_session(
@@ -348,7 +348,7 @@ class SessionManagerTest(tf.test.TestCase):
graph=graph,
ready_op=tf.report_uninitialized_variables(),
ready_for_local_init_op=tf.report_uninitialized_variables(
- tf.all_variables()),
+ tf.global_variables()),
local_init_op=w.initializer)
# Initialize v but not w
@@ -417,7 +417,7 @@ class SessionManagerTest(tf.test.TestCase):
sm2 = tf.train.SessionManager(
ready_op=tf.report_uninitialized_variables(),
ready_for_local_init_op=tf.report_uninitialized_variables(
- tf.all_variables()),
+ tf.global_variables()),
local_init_op=w.initializer)
sess = sm2.prepare_session("", init_op=v.initializer)
self.assertEqual(
@@ -462,7 +462,7 @@ class SessionManagerTest(tf.test.TestCase):
sm2 = tf.train.SessionManager(
ready_op=tf.report_uninitialized_variables(),
ready_for_local_init_op=tf.report_uninitialized_variables(
- tf.all_variables()),
+ tf.global_variables()),
local_init_op=w.initializer)
with self.assertRaisesRegexp(
RuntimeError,
diff --git a/tensorflow/python/training/slot_creator.py b/tensorflow/python/training/slot_creator.py
index c631d78fdd..4d1ad44723 100644
--- a/tensorflow/python/training/slot_creator.py
+++ b/tensorflow/python/training/slot_creator.py
@@ -42,18 +42,29 @@ from __future__ import print_function
from tensorflow.python.framework import ops
from tensorflow.python.ops import array_ops
from tensorflow.python.ops import variables
+from tensorflow.python.ops import variable_scope
def _create_slot_var(primary, val, scope):
"""Helper function for creating a slot variable."""
- slot = variables.Variable(val, name=scope, trainable=False)
+ # TODO(lukaszkaiser): Consider allowing partitioners to be set in the current
+ # scope.
+ current_partitioner = variable_scope.get_variable_scope().partitioner
+ variable_scope.get_variable_scope().set_partitioner(None)
+ slot = variable_scope.get_variable(scope, initializer=val, trainable=False)
+ variable_scope.get_variable_scope().set_partitioner(current_partitioner)
+
# pylint: disable=protected-access
if isinstance(primary, variables.Variable) and primary._save_slice_info:
# Primary is a partitioned variable, so we need to also indicate that
# the slot is a partitioned variable. Slots have the same partitioning
# as their primaries.
- real_slot_name = scope[len(primary.op.name + "/"):-1]
+ # For examples when using AdamOptimizer in linear model, slot.name
+ # here can be "linear//weights/Adam:0", while primary.op.name is
+ # "linear//weight". We want to get 'Adam' as real_slot_name, so we
+ # remove "'linear//weight' + '/'" and ':0'.
+ real_slot_name = slot.name[len(primary.op.name + "/"):-2]
slice_info = primary._save_slice_info
slot._set_save_slice_info(variables.Variable.SaveSliceInfo(
slice_info.full_name + "/" + real_slot_name,
@@ -80,12 +91,16 @@ def create_slot(primary, val, name, colocate_with_primary=True):
A `Variable` object.
"""
# Scope the slot name in the namespace of the primary variable.
- with ops.name_scope(primary.op.name + "/" + name) as scope:
+ # Set "primary.op.name + '/' + name" as default name, so the scope name of
+ # optimizer can be shared when reuse is True. Meanwhile when reuse is False
+ # and the same name has been previously used, the scope name will add '_N'
+ # as suffix for unique identifications.
+ with variable_scope.variable_scope(None, primary.op.name + '/' + name):
if colocate_with_primary:
with ops.colocate_with(primary):
- return _create_slot_var(primary, val, scope)
+ return _create_slot_var(primary, val, '')
else:
- return _create_slot_var(primary, val, scope)
+ return _create_slot_var(primary, val, '')
def create_zeros_slot(primary, name, dtype=None, colocate_with_primary=True):
diff --git a/tensorflow/python/training/supervisor.py b/tensorflow/python/training/supervisor.py
index 5aa27eb222..eb8efd17e2 100644
--- a/tensorflow/python/training/supervisor.py
+++ b/tensorflow/python/training/supervisor.py
@@ -248,7 +248,7 @@ class Supervisor(object):
ready to run the local_init_op.
The model is considered ready if it returns an empty array. Defaults to
the tensor returned from
- `tf.report_uninitialized_variables(tf.all_variables())`. If `None`, the
+ `tf.report_uninitialized_variables(tf.global_variables())`. If `None`, the
model is not checked for readiness before running local_init_op.
is_chief: If True, create a chief supervisor in charge of initializing
and restoring the model. If False, create a supervisor that relies
diff --git a/tensorflow/python/training/supervisor_test.py b/tensorflow/python/training/supervisor_test.py
index dda0166aa6..c7c16cdf81 100644
--- a/tensorflow/python/training/supervisor_test.py
+++ b/tensorflow/python/training/supervisor_test.py
@@ -531,7 +531,7 @@ class SupervisorTest(tf.test.TestCase):
collections=[tf.GraphKeys.LOCAL_VARIABLES],
name="default_ready_for_local_init_op_w_" + str(uid))
ready_for_local_init_op = tf.report_uninitialized_variables(
- tf.all_variables())
+ tf.global_variables())
sv = tf.train.Supervisor(
logdir=logdir,
is_chief=is_chief,
@@ -588,7 +588,7 @@ class SupervisorTest(tf.test.TestCase):
collections=[tf.GraphKeys.LOCAL_VARIABLES],
name="ready_for_local_init_op_restore_w_" + str(uid))
ready_for_local_init_op = tf.report_uninitialized_variables(
- tf.all_variables())
+ tf.global_variables())
sv = tf.train.Supervisor(
logdir=logdir,
is_chief=is_chief,
@@ -624,7 +624,7 @@ class SupervisorTest(tf.test.TestCase):
# This shouldn't add a variable to the VARIABLES collection responsible
# for variables that are saved/restored from checkpoints.
- self.assertEquals(len(tf.all_variables()), 0)
+ self.assertEquals(len(tf.global_variables()), 0)
# Suppress normal variable inits to make sure the local one is
# initialized via local_init_op.
@@ -644,7 +644,7 @@ class SupervisorTest(tf.test.TestCase):
collections=[tf.GraphKeys.LOCAL_VARIABLES])
# This shouldn't add a variable to the VARIABLES collection responsible
# for variables that are saved/restored from checkpoints.
- self.assertEquals(len(tf.all_variables()), 0)
+ self.assertEquals(len(tf.global_variables()), 0)
# Suppress normal variable inits to make sure the local one is
# initialized via local_init_op.
diff --git a/tensorflow/stream_executor/lib/process_state.cc b/tensorflow/stream_executor/lib/process_state.cc
index be4295b65f..849143b9fe 100644
--- a/tensorflow/stream_executor/lib/process_state.cc
+++ b/tensorflow/stream_executor/lib/process_state.cc
@@ -19,6 +19,7 @@ limitations under the License.
#include <direct.h>
#include <stdlib.h>
#include <WinSock2.h>
+#pragma comment(lib, "Ws2_32.lib")
#else
#include <unistd.h>
#endif
diff --git a/tensorflow/tensorboard/README.md b/tensorflow/tensorboard/README.md
index 52762b9a8f..b3f6689302 100644
--- a/tensorflow/tensorboard/README.md
+++ b/tensorflow/tensorboard/README.md
@@ -54,18 +54,18 @@ work, but there may be bugs or performance issues.
The first step in using TensorBoard is acquiring data from your TensorFlow run.
For this, you need [summary
-ops](https://www.tensorflow.org/versions/r0.11/api_docs/python/train.html#summary-operations).
+ops](https://www.tensorflow.org/versions/r0.12/api_docs/python/train.html#summary-operations).
Summary ops are ops, like
-[`tf.matmul`](https://www.tensorflow.org/versions/r0.11/api_docs/python/math_ops.html#matmul)
+[`tf.matmul`](https://www.tensorflow.org/versions/r0.12/api_docs/python/math_ops.html#matmul)
or
-[`tf.nn.relu`](https://www.tensorflow.org/versions/r0.11/api_docs/python/nn.html#relu),
+[`tf.nn.relu`](https://www.tensorflow.org/versions/r0.12/api_docs/python/nn.html#relu),
which means they take in tensors, produce tensors, and are evaluated from within
a TensorFlow graph. However, summary ops have a twist: the Tensors they produce
contain serialized protobufs, which are written to disk and sent to TensorBoard.
To visualize the summary data in TensorBoard, you should evaluate the summary
op, retrieve the result, and then write that result to disk using a
summary.FileWriter. A full explanation, with examples, is in [the
-tutorial](https://www.tensorflow.org/versions/r0.11/how_tos/summaries_and_tensorboard/index.html).
+tutorial](https://www.tensorflow.org/versions/r0.12/how_tos/summaries_and_tensorboard/index.html).
### Tags: Giving names to data
@@ -187,7 +187,7 @@ TensorFlow model. To get best use of the graph visualizer, you should use name
scopes to hierarchically group the ops in your graph - otherwise, the graph may
be difficult to decipher. For more information, including examples, see [the
graph visualizer
-tutorial](https://www.tensorflow.org/versions/r0.11/how_tos/graph_viz/index.html#tensorboard-graph-visualization).
+tutorial](https://www.tensorflow.org/versions/r0.12/how_tos/graph_viz/index.html#tensorboard-graph-visualization).
# Frequently Asked Questions
diff --git a/tensorflow/tensorboard/backend/server.py b/tensorflow/tensorboard/backend/server.py
index 6f961f1803..7847371bb6 100644
--- a/tensorflow/tensorboard/backend/server.py
+++ b/tensorflow/tensorboard/backend/server.py
@@ -69,7 +69,7 @@ def ParseEventFilesSpec(logdir):
if logdir is None:
return files
# Make sure keeping consistent with ParseURI in core/lib/io/path.cc
- uri_pattern = re.compile("[a-zA-Z][0-9a-zA-Z.]://.*")
+ uri_pattern = re.compile("[a-zA-Z][0-9a-zA-Z.]*://.*")
for specification in logdir.split(','):
# Check if the spec contains group. A spec start with xyz:// is regarded as
# URI path spec instead of group spec. If the spec looks like /foo:bar/baz,
diff --git a/tensorflow/tensorboard/backend/server_test.py b/tensorflow/tensorboard/backend/server_test.py
index c61953f174..596fff2864 100644
--- a/tensorflow/tensorboard/backend/server_test.py
+++ b/tensorflow/tensorboard/backend/server_test.py
@@ -465,6 +465,11 @@ class ParseEventFilesSpecTest(tf.test.TestCase):
expected = {'gs://foo/path': None}
self.assertEqual(server.ParseEventFilesSpec(logdir_string), expected)
+ def testRespectsHDFSPath(self):
+ logdir_string = 'hdfs://foo/path'
+ expected = {'hdfs://foo/path': None}
+ self.assertEqual(server.ParseEventFilesSpec(logdir_string), expected)
+
def testDoesNotExpandUserInGCSPath(self):
logdir_string = 'gs://~/foo/path'
expected = {'gs://~/foo/path': None}
diff --git a/tensorflow/tensorboard/dist/index.html b/tensorflow/tensorboard/dist/index.html
index 29c800bea1..66fce9fe9a 100644
--- a/tensorflow/tensorboard/dist/index.html
+++ b/tensorflow/tensorboard/dist/index.html
@@ -21,6 +21,7 @@ limitations under the License.
<title>TensorBoard</title>
<script src="webcomponentsjs/webcomponents-lite.min.js"></script>
<link rel="stylesheet" type="text/css" href="lib/css/global.css">
+ <link rel="stylesheet" type="text/css" href="plottable/plottable.css">
<link rel="shortcut icon" href="data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAAAMQAAADECAMAAAD3eH5ZAAAABGdBTUEAALGPC/xhBQAAAAFzUkdCAK7OHOkAAAD/UExURfFlKfaELvFmKfNyK/67NvWALf68Nv69NvNxK/20NfyyNP22NfN0K/JrKvqhMv2zNf25Nf24Nf23NfeOL/yzNPyvNPJoKviWMPmeMfN1K/WBLfePL/FnKfeML/qlMvR7LPmcMfeLL/aJLvR5LPFoKfJuKvR3LP66NvywNPeNL/V/LfaILv21Nf26NfNzK/NvK/R6LPmaMfyxNPqfMvV+LfurM/iSMPmbMfJvKvmdMfumM/qiMvmZMfytNPJqKvysNPN2K/iYMPNwK/upM/JtKvJsKviVMPaHLvaGLvJpKvR8LPaKLvqkMvuqM/aFLvR4LPuoM/iTMPWDLfiRMPmYMXS0ngkAAALoSURBVHja7drnctpAFIbhFUISSKJ3MKYa0+y4xTW9937/15JkJhlTjhrSrHRmvuf/as6L0YLFCgEAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAMBJ6njenqspzgnPrsrGX9Zpi2tCrmnc6+dYNthVY5WpMmxQLWPdMsOuYVwzNj3ei2t3mQwaV43BJPDCS2NbJ5aEeuX/+9qcjQOtfFIkIkrvY2g4MVcmOBsFWbowKO/kNyj62gRpJcDaPBlxLr1B0zdG0C/8LzbJiJrshuvy1gzlA9+rD8mIkuyIJjFE3/dqnYwoSm7IUEPoD/wut8iIguSIDjlFxe/yfXL5vuSI21BTZLLhXoOILMO8Hxwa/L8bI0LfmUdhGowb2ZvT0e57pFNDgB06IlVyjmmIBl2T/nl9Rw6SD9GgSG/Q0uQkaW3XhmovKQ3eFQ4N2Uo9OQ1eFZsNerf7vP+rO4rhmY1Lg3vFVoP8+8BXg1sFnwbnCk4NThW8GuiKBDdkVVtTNFvNelVsNqTbyWnIOM2oeTRoyWvwmpJHg/ucXBrcJuXT4DwrpwZi2vy0VCx8YtXg/D2bU4OfiuQ3eFfE2KD4bfCqiLNB993gXsGlwa2CT4NzBacGIVQ6YsipQdh0xEdODUKjIxrSp88onZ8zbbFLg1DoiFO5BXvDGv2My9/JhUT8JUZTI0yDaNHLBzIbvqTDNYhUiVw/kdjQ1kM2CHFDPjKW+KzyRTF0g/ga9w9y+fANQpxvX8CU+Ny7FUWDeF3Y+g3lROIf4k0UDX9eCyvO531PyYhHga9zvPZJU5b73Y/eXj8Hv9D48n6HaF5LbcjRt8TZTtda5M1DfXnbkX1C0SHCFKzQB5Fe8op4GNGNHavvZESbVwT5r6W1xyuCPBY3Y9YgDqzknH/e3YfNzzuL30l0IebrZ5kKtuDIXt1n868ET6kf3/49tLvrCcZyF8Pu215dAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAcPIbNrBhOaBXucoAAAAASUVORK5CYII=">
<link rel="import" href="dist/bazel-html-imports.html">
<link rel="import" href="dist/tf-tensorboard.html">
diff --git a/tensorflow/tensorflow.bzl b/tensorflow/tensorflow.bzl
index 42753c170b..6cdd02bdca 100644
--- a/tensorflow/tensorflow.bzl
+++ b/tensorflow/tensorflow.bzl
@@ -138,6 +138,9 @@ def tf_copts():
"//tensorflow:windows": [
"/DLANG_CXX11",
"/D__VERSION__=\\\"MSVC\\\"",
+ "/DPLATFORM_WINDOWS",
+ "/DEIGEN_HAS_C99_MATH",
+ "/DTENSORFLOW_USE_EIGEN_THREADPOOL",
],
"//tensorflow:ios": ["-std=c++11"],
"//conditions:default": ["-pthread"]}))
@@ -418,7 +421,7 @@ def _cuda_copts():
# libraries needed by GPU kernels.
def tf_gpu_kernel_library(srcs, copts=[], cuda_copts=[], deps=[], hdrs=[],
**kwargs):
- copts = copts + _cuda_copts() + if_cuda(cuda_copts)
+ copts = copts + _cuda_copts() + if_cuda(cuda_copts) + tf_copts()
native.cc_library(
srcs = srcs,
diff --git a/tensorflow/tools/ci_build/builds/libtensorflow.sh b/tensorflow/tools/ci_build/builds/libtensorflow.sh
new file mode 100755
index 0000000000..a9989fe504
--- /dev/null
+++ b/tensorflow/tools/ci_build/builds/libtensorflow.sh
@@ -0,0 +1,65 @@
+#!/usr/bin/env bash
+# Copyright 2016 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+#
+# Script to generate a tarball containing the TensorFlow C-library which
+# consists of the C API header file and libtensorflow.so.
+#
+# Work in progress but this is a step towards a "binary" distribution of the
+# TensorFlow C-library allowing TensorFlow language bindings to be used
+# without having to recompile the TensorFlow framework from sources, which
+# takes a while and also introduces many other dependencies.
+#
+# Usage:
+# - Source this file in another bash script
+# - Execute build_libtensorflow_tarball SUFFIX
+#
+# Produces: lib_package/libtensorflow${SUFFIX}.tar.gz
+#
+# ASSUMPTIONS:
+# - build_libtensorflow_tarball is invoked from the root of the git tree.
+# - Any environment variables needed by the "configure" script have been set.
+
+function build_libtensorflow_tarball() {
+ # Sanity check that this is being run from the root of the git repository.
+ if [ ! -e "WORKSPACE" ]; then
+ echo "Must run this from the root of the bazel workspace"
+ exit 1
+ fi
+ TARBALL_SUFFIX="${1}"
+ BAZEL="bazel --bazelrc ./tensorflow/tools/ci_build/install/.bazelrc"
+ BAZEL_OPTS="-c opt"
+ if [ "${TF_NEED_CUDA}" == "1" ]; then
+ BAZEL_OPTS="${BAZEL_OPTS} --config=cuda"
+ fi
+ bazel clean --expunge
+ yes "" | ./configure
+
+ # TODO(ashankar): Once
+ # https://github.com/tensorflow/tensorflow/commit/1b32b698eddc10c0d85b0b8cf838f42023394de7
+ # can be undone, i.e., when bazel supports pkg_tar with python3+ then all of this below
+ # can be replaced with something like:
+ # bazel build ${BAZEL_OPTS} //tensorflow/tools/lib_package:libtensorflow.tar.gz
+
+ bazel build ${BAZEL_OPTS} //tensorflow:libtensorflow.so
+ DIR=lib_package
+ rm -rf ${DIR}
+ mkdir -p ${DIR}/build/lib
+ mkdir -p ${DIR}/build/include/tensorflow/c
+ cp bazel-bin/tensorflow/libtensorflow.so ${DIR}/build/lib
+ cp tensorflow/c/c_api.h ${DIR}/build/include/tensorflow/c
+ tar -C ${DIR}/build -cvf ${DIR}/libtensorflow${TARBALL_SUFFIX}.tar.gz include/tensorflow/c/c_api.h lib/libtensorflow.so
+ rm -rf ${DIR}/build
+}
diff --git a/tensorflow/tools/ci_build/builds/pip.sh b/tensorflow/tools/ci_build/builds/pip.sh
index be42b38b38..e381603ca8 100755
--- a/tensorflow/tools/ci_build/builds/pip.sh
+++ b/tensorflow/tools/ci_build/builds/pip.sh
@@ -118,6 +118,12 @@ else
die "Unrecognized container type: \"${CONTAINER_TYPE}\""
fi
+MAC_FLAG=""
+if [[ $(uname) == "Darwin" ]]; then
+ MAC_FLAG="--mac"
+fi
+
+
# If still in a virtualenv, deactivate it first
if [[ ! -z "$(which deactivate)" ]]; then
echo "It appears that we are already in a virtualenv. Deactivating..."
@@ -268,7 +274,7 @@ fi
# Call test_installation.sh to perform test-on-install
-"${SCRIPT_DIR}/test_installation.sh" --virtualenv ${GPU_FLAG} ||
+"${SCRIPT_DIR}/test_installation.sh" --virtualenv ${GPU_FLAG} ${MAC_FLAG} ||
die "PIP tests-on-install FAILED"
# Test user ops
diff --git a/tensorflow/tools/ci_build/builds/test_installation.sh b/tensorflow/tools/ci_build/builds/test_installation.sh
index e376ba51dc..a1e1c22c4d 100755
--- a/tensorflow/tools/ci_build/builds/test_installation.sh
+++ b/tensorflow/tools/ci_build/builds/test_installation.sh
@@ -18,7 +18,7 @@
# and run the Python unit tests from the source code on the installation
#
# Usage:
-# test_installation.sh [--virtualenv] [--gpu]
+# test_installation.sh [--virtualenv] [--gpu] [--mac]
#
# If the flag --virtualenv is set, the script will use "python" as the Python
# binary path. Otherwise, it will use tools/python_bin_path.sh to determine
@@ -27,6 +27,9 @@
# The --gpu flag informs the script that this is a GPU build, so that the
# appropriate test blacklists can be applied accordingly.
#
+# The --mac flag informs the script that this is running on mac. Mac does not
+# have flock, so we should skip using parallel_gpu_execute on mac.
+#
# When executing the Python unit tests, the script obeys the shell
# variables: PY_TEST_WHITELIST, PY_TEST_BLACKLIST, PY_TEST_GPU_BLACKLIST,
#
@@ -107,6 +110,7 @@ PY_TEST_BLACKLIST="${PY_TEST_BLACKLIST}:"\
PY_TEST_GPU_BLACKLIST="${PY_TEST_GPU_BLACKLIST}:"\
"tensorflow/python/client/session_test.py:"\
"tensorflow/python/framework/function_test.py:"\
+"tensorflow/contrib/integrate/python/ops/odes_test.py:"\
"tensorflow/contrib/tensor_forest/python/kernel_tests/scatter_add_ndim_op_test.py"
# Tests that should be run in the exclusive mode (i.e., not parallel with
@@ -135,11 +139,14 @@ TF_GPU_COUNT=${TF_GPU_COUNT:-8}
# Process input arguments
IS_VIRTUALENV=0
IS_GPU=0
+IS_MAC=0
while true; do
if [[ "$1" == "--virtualenv" ]]; then
IS_VIRTUALENV=1
elif [[ "$1" == "--gpu" ]]; then
IS_GPU=1
+ elif [[ "$1" == "--mac" ]]; then
+ IS_MAC=1
fi
shift
@@ -410,7 +417,11 @@ FAILED_TESTS=""
FAILED_TEST_LOGS=""
if [[ "${IS_GPU}" == "1" ]]; then
- N_JOBS=$TF_GPU_COUNT
+ if [[ "${IS_MAC}" == "1" ]]; then
+ N_JOBS=1
+ else
+ N_JOBS=$TF_GPU_COUNT
+ fi
else
N_JOBS=$(grep -c ^processor /proc/cpuinfo)
if [[ -z ${N_JOBS} ]]; then
@@ -483,7 +494,9 @@ while true; do
TEST_LOGS="${TEST_LOGS} ${TEST_LOG}"
# Launch test asynchronously
- if [[ "${IS_GPU}" == "1" ]]; then
+ if [[ "${IS_GPU}" == "1" ]] && [[ "${IS_MAC}" == "0" ]]; then
+ # Only use this script when not on a Mac: it relies on flock, which
+ # is not available on macOS.
"${SCRIPT_DIR}/../gpu_build/parallel_gpu_execute.sh" \
"${SCRIPT_DIR}/py_test_delegate.sh" \
"${PYTHON_BIN_PATH}" "${PY_TEST_DIR}/${TEST_BASENAME}" "${TEST_LOG}" &
diff --git a/tensorflow/tools/ci_build/ci_build.sh b/tensorflow/tools/ci_build/ci_build.sh
index 54587effc9..3697fd46a0 100755
--- a/tensorflow/tools/ci_build/ci_build.sh
+++ b/tensorflow/tools/ci_build/ci_build.sh
@@ -148,9 +148,9 @@ mkdir -p ${WORKSPACE}/bazel-ci_build-cache
${DOCKER_BINARY} run --rm --pid=host \
-v ${WORKSPACE}/bazel-ci_build-cache:${WORKSPACE}/bazel-ci_build-cache \
-e "CI_BUILD_HOME=${WORKSPACE}/bazel-ci_build-cache" \
- -e "CI_BUILD_USER=$(id -u --name)" \
+ -e "CI_BUILD_USER=$(id -u -n)" \
-e "CI_BUILD_UID=$(id -u)" \
- -e "CI_BUILD_GROUP=$(id -g --name)" \
+ -e "CI_BUILD_GROUP=$(id -g -n)" \
-e "CI_BUILD_GID=$(id -g)" \
-e "CI_TENSORFLOW_SUBMODULE_PATH=${CI_TENSORFLOW_SUBMODULE_PATH}" \
-v ${WORKSPACE}:/workspace \
diff --git a/tensorflow/tools/ci_build/gpu_build/parallel_gpu_execute.sh b/tensorflow/tools/ci_build/gpu_build/parallel_gpu_execute.sh
index 0466522943..c33ea2d5cc 100755
--- a/tensorflow/tools/ci_build/gpu_build/parallel_gpu_execute.sh
+++ b/tensorflow/tools/ci_build/gpu_build/parallel_gpu_execute.sh
@@ -37,8 +37,9 @@ for i in `seq 0 $((TF_GPU_COUNT-1))`; do
echo "Running test $@ on GPU $CUDA_VISIBLE_DEVICES"
$@
)
+ return_code=$?
flock -u "$lock_fd"
- exit 0
+ exit $return_code
fi
done
diff --git a/tensorflow/tools/ci_build/linux/libtensorflow.sh b/tensorflow/tools/ci_build/linux/libtensorflow.sh
new file mode 100755
index 0000000000..0013038e8b
--- /dev/null
+++ b/tensorflow/tools/ci_build/linux/libtensorflow.sh
@@ -0,0 +1,32 @@
+#!/usr/bin/env bash
+# Copyright 2016 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+#
+# Script to produce a tarball release of the C-library and associated C API
+# header file. Intended to be run inside a docker container. See
+# libtensorflow_docker.sh
+
+set -ex
+SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
+
+# See comments at the top of the sourced file for details.
+source "${SCRIPT_DIR}/../builds/libtensorflow.sh"
+
+SUFFIX="-linux-cpu-"
+if [ "${TF_NEED_CUDA}" == "1" ]; then
+ SUFFIX="-linux-gpu-"
+fi
+
+build_libtensorflow_tarball "${SUFFIX}$(uname -m)"
diff --git a/tensorflow/contrib/seq2seq/python/__init__.py b/tensorflow/tools/ci_build/linux/libtensorflow_cpu.sh
index c5ca3a623f..c300c4670f 100644..100755
--- a/tensorflow/contrib/seq2seq/python/__init__.py
+++ b/tensorflow/tools/ci_build/linux/libtensorflow_cpu.sh
@@ -1,3 +1,4 @@
+#!/usr/bin/env bash
# Copyright 2016 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
@@ -12,8 +13,11 @@
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
-"""ops module."""
+#
+# Script to build a binary release tarball for the TensorFlow C-library without
+# GPU support.
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
+set -ex
+SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
+export TF_NEED_CUDA=0
+"${SCRIPT_DIR}/libtensorflow_docker.sh"
diff --git a/tensorflow/tools/ci_build/linux/libtensorflow_docker.sh b/tensorflow/tools/ci_build/linux/libtensorflow_docker.sh
new file mode 100755
index 0000000000..5423831caa
--- /dev/null
+++ b/tensorflow/tools/ci_build/linux/libtensorflow_docker.sh
@@ -0,0 +1,53 @@
+#!/usr/bin/env bash
+# Copyright 2016 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+#
+# Script to produce a tarball release of the C-library and associated C API
+# header file. Builds a docker container and then builds the C-library in
+# said container.
+#
+# See libtensorflow_cpu.sh and libtensorflow_gpu.sh
+
+set -ex
+SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
+DOCKER_CONTEXT_PATH="$(realpath ${SCRIPT_DIR}/..)"
+ROOT_DIR="$(realpath ${SCRIPT_DIR}/../../../../)"
+
+DOCKER_IMAGE="tf-libtensorflow-cpu"
+DOCKER_FILE="Dockerfile.cpu"
+DOCKER_BINARY="docker"
+if [ "${TF_NEED_CUDA}" == "1" ]; then
+ DOCKER_IMAGE="tf-tensorflow-gpu"
+ DOCKER_BINARY="nvidia-docker"
+ DOCKER_FILE="Dockerfile.gpu"
+fi
+
+docker build \
+ -t "${DOCKER_IMAGE}" \
+ -f "${DOCKER_CONTEXT_PATH}/${DOCKER_FILE}" \
+ "${DOCKER_CONTEXT_PATH}"
+
+${DOCKER_BINARY} run \
+ --rm \
+ --pid=host \
+ -v ${ROOT_DIR}:/workspace \
+ -w /workspace \
+ -e "PYTHON_BIN_PATH=/usr/bin/python" \
+ -e "TF_NEED_GCP=0" \
+ -e "TF_NEED_HDFS=0" \
+ -e "TF_NEED_CUDA=${TF_NEED_CUDA}" \
+ -e "TF_NEED_OPENCL=0" \
+ "${DOCKER_IMAGE}" \
+ "/workspace/tensorflow/tools/ci_build/linux/libtensorflow.sh"
diff --git a/tensorflow/contrib/seq2seq/python/ops/layers.py b/tensorflow/tools/ci_build/linux/libtensorflow_gpu.sh
index 4ee2df6073..1879b13b0f 100644..100755
--- a/tensorflow/contrib/seq2seq/python/ops/layers.py
+++ b/tensorflow/tools/ci_build/linux/libtensorflow_gpu.sh
@@ -1,3 +1,4 @@
+#!/usr/bin/env bash
# Copyright 2016 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
@@ -12,24 +13,10 @@
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
-
-"""Seq2seq layer operations for use in neural networks.
-"""
-
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-
-from tensorflow.python.ops import array_ops
-
-
-__all__ = ["rnn_decoder",
- "rnn_decoder_attention"]
-
-
-def rnn_decoder(*args, **kwargs):
- pass
-
-
-def rnn_decoder_attention(*args, **kwargs):
- pass
+#
+# Script to build a binary release tarball for the TensorFlow C-library for
+# machines with GPUs.
+set -ex
+SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
+export TF_NEED_CUDA=1
+"${SCRIPT_DIR}/libtensorflow_docker.sh"
diff --git a/tensorflow/tools/ci_build/osx/libtensorflow_cpu.sh b/tensorflow/tools/ci_build/osx/libtensorflow_cpu.sh
new file mode 100755
index 0000000000..8b82bad105
--- /dev/null
+++ b/tensorflow/tools/ci_build/osx/libtensorflow_cpu.sh
@@ -0,0 +1,36 @@
+#!/usr/bin/env bash
+# Copyright 2016 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+#
+# Script to produce a tarball release of the C-library and associated C API
+# header file.
+# Produces: lib_package/libtensorflow-darwin-cpu-x86_64.tar.gz
+
+set -ex
+SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
+
+# See comments at the top of the sourced file for details.
+source "${SCRIPT_DIR}/../builds/libtensorflow.sh"
+
+# Configure script
+export PYTHON_BIN_PATH="/usr/bin/python"
+export TF_NEED_GCP=0
+export TF_NEED_HDFS=0
+export TF_NEED_CUDA=0
+export TF_NEED_OPENCL=0
+export COMPUTECPP_PATH="/usr/local"
+
+export PATH="/usr/local/bin:/usr/bin:/bin:/usr/sbin:/sbin"
+build_libtensorflow_tarball "-darwin-cpu-$(uname -m)"
diff --git a/tensorflow/tools/ci_build/osx/libtensorflow_gpu.sh b/tensorflow/tools/ci_build/osx/libtensorflow_gpu.sh
new file mode 100755
index 0000000000..fdb4cc187d
--- /dev/null
+++ b/tensorflow/tools/ci_build/osx/libtensorflow_gpu.sh
@@ -0,0 +1,37 @@
+#!/usr/bin/env bash
+# Copyright 2016 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+#
+# Script to produce a tarball release of the C-library and associated C API
+# header file.
+# Produces: lib_package/libtensorflow-darwin-gpu-x86_64.tar.gz
+
+set -ex
+SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
+
+# See comments at the top of the sourced file for details.
+source "${SCRIPT_DIR}/../builds/libtensorflow.sh"
+
+# Configure script
+export TF_NEED_CUDA=1
+export LD_LIBRARY_PATH="/usr/local/cuda/lib:/usr/local/cuda/extras/CUPTI/lib:${LD_LIBRARY_PATH}"
+export PYTHON_BIN_PATH="/usr/bin/python"
+export TF_NEED_GCP=0
+export TF_NEED_HDFS=0
+export TF_NEED_OPENCL=0
+export COMPUTECPP_PATH="/usr/local"
+
+export PATH="/usr/local/cuda/bin:/usr/local/bin:/usr/bin:/bin:/usr/sbin:/sbin"
+build_libtensorflow_tarball "-darwin-gpu-$(uname -m)"
diff --git a/tensorflow/tools/ci_build/update_version.sh b/tensorflow/tools/ci_build/update_version.sh
index c1b48dd8cb..759c7e5f7e 100755
--- a/tensorflow/tools/ci_build/update_version.sh
+++ b/tensorflow/tools/ci_build/update_version.sh
@@ -91,16 +91,6 @@ check_existence file "${CMAKE_SETUP_PY}"
sed -i -e "s/^\_VERSION = [\'\"].*-cmake-experimental[\'\"]/\_VERSION = \'${MAJOR}.${MINOR}.${PATCH}-cmake-experimental\'/g" "${CMAKE_SETUP_PY}"
-# Update Dockerfiles in tensorflow/tools/docker/
-TOOLS_DOCKER_DIR="${TF_SRC_DIR}/tools/docker"
-check_existence dir "${TOOLS_DOCKER_DIR}"
-
-# Determine the files that need to be modified
-DOCKERFILES=$(grep -lrE "^ENV TENSORFLOW_VERSION .+" ${TOOLS_DOCKER_DIR})
-for DOCKERF in ${DOCKERFILES}; do
- sed -i -r -e "s/^ENV TENSORFLOW_VERSION .+/ENV TENSORFLOW_VERSION ${MAJOR}.${MINOR}.${PATCH}/g" "${DOCKERF}"
-done
-
# Update os_setup.md
OS_SETUP="${TF_SRC_DIR}/g3doc/get_started/os_setup.md"
diff --git a/tensorflow/tools/ci_build/windows/cpu/bazel/common_env.sh b/tensorflow/tools/ci_build/windows/cpu/bazel/common_env.sh
new file mode 100644
index 0000000000..6e7e555065
--- /dev/null
+++ b/tensorflow/tools/ci_build/windows/cpu/bazel/common_env.sh
@@ -0,0 +1,50 @@
+#!/bin/bash
+# Copyright 2016 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+#
+# This script assumes the standard setup on tensorflow Jenkins windows machines.
+# It is NOT guaranteed to work on any other machine. Use at your own risk!
+#
+# REQUIREMENTS:
+# * All installed in standard locations:
+# - JDK8, and JAVA_HOME set.
+# - Microsoft Visual Studio 2015 Community Edition
+# - Msys2
+# - Anaconda3
+# * Bazel windows executable copied as "bazel.exe" and included in PATH.
+
+# All commands shall pass, and all should be visible.
+set -x
+set -e
+
+# Use a temporary directory with a short name.
+export TMPDIR="C:/tmp"
+mkdir -p "$TMPDIR"
+
+# Set bash path
+export BAZEL_SH="C:/tools/msys64/usr/bin/bash"
+
+# Set Python path for ./configure
+export PYTHON_BIN_PATH="C:/Program Files/Anaconda3/python"
+
+# Set Python path for cc_configure.bzl
+export BAZEL_PYTHON="C:/Program Files/Anaconda3/python"
+
+# Set Visual Studio path
+export BAZEL_VS="C:/Program Files (x86)/Microsoft Visual Studio 14.0"
+
+# Add python into PATH, it's needed because gen_git_source.py uses
+# '/usr/bin/env python' as a shebang
+export PATH="/c/Program Files/Anaconda3:$PATH"
diff --git a/tensorflow/tools/ci_build/windows/cpu/bazel/run_cc_test_windows.bat b/tensorflow/tools/ci_build/windows/cpu/bazel/run_cc_test_windows.bat
new file mode 100644
index 0000000000..99aea4278b
--- /dev/null
+++ b/tensorflow/tools/ci_build/windows/cpu/bazel/run_cc_test_windows.bat
@@ -0,0 +1 @@
+c:\tools\msys64\usr\bin\bash -l %cd%/tensorflow/tools/ci_build/windows/cpu/bazel/run_cc_test_windows.sh %*
diff --git a/tensorflow/tools/ci_build/windows/cpu/bazel/run_cc_test_windows.sh b/tensorflow/tools/ci_build/windows/cpu/bazel/run_cc_test_windows.sh
new file mode 100644
index 0000000000..3e882656a9
--- /dev/null
+++ b/tensorflow/tools/ci_build/windows/cpu/bazel/run_cc_test_windows.sh
@@ -0,0 +1,140 @@
+#!/bin/bash
+# Copyright 2016 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+#
+# This script assumes the standard setup on tensorflow Jenkins windows machines.
+# It is NOT guaranteed to work on any other machine. Use at your own risk!
+#
+# REQUIREMENTS:
+# * All installed in standard locations:
+# - JDK8, and JAVA_HOME set.
+# - Microsoft Visual Studio 2015 Community Edition
+# - Msys2
+# - Anaconda3
+# * Bazel windows executable copied as "bazel.exe" and included in PATH.
+
+# All commands shall pass, and all should be visible.
+set -x
+set -e
+
+# This script is under <repo_root>/tensorflow/tools/ci_build/windows/cpu/bazel
+# Change into repository root.
+script_dir=$(dirname $0)
+cd ${script_dir%%tensorflow/tools/ci_build/windows/cpu/bazel}.
+
+# Setting up the environment variables Bazel and ./configure needs
+source "tensorflow/tools/ci_build/windows/cpu/bazel/common_env.sh" \
+ || { echo "Failed to source common_env.sh" >&2; exit 1; }
+
+# bazel clean --expunge doesn't work on Windows yet.
+# Clean the output base manually to ensure build correctness
+bazel clean
+output_base=$(bazel info output_base)
+bazel shutdown
+# Sleep 5s to wait for jvm shutdown completely
+# otherwise rm will fail with device or resource busy error
+sleep 5
+rm -rf ${output_base}
+
+export TF_NEED_CUDA=0
+echo "" | ./configure
+
+failing_tests="\
+ //tensorflow/core:example_example_parser_configuration_test + \
+ //tensorflow/core/kernels:sparse_dense_binary_op_shared_test + \
+ //tensorflow/core/kernels:sparse_reduce_sum_op_test + \
+ //tensorflow/core:lib_core_status_test + \
+ //tensorflow/core:lib_monitoring_collection_registry_test + \
+ //tensorflow/core:lib_strings_numbers_test + \
+ //tensorflow/core:lib_strings_str_util_test + \
+ //tensorflow/core/platform/hadoop:hadoop_file_system_test + \
+ //tensorflow/core:platform_file_system_test + \
+ //tensorflow/core:platform_logging_test + \
+ //tensorflow/core:util_sparse_sparse_tensor_test + \
+ //tensorflow/cc:framework_gradient_checker_test + \
+ //tensorflow/cc:framework_gradients_test + \
+ //tensorflow/cc:gradients_array_grad_test + \
+ //tensorflow/cc:gradients_math_grad_test + \
+ //tensorflow/cc:gradients_nn_grad_test + \
+ //tensorflow/cc/saved_model:loader_test
+"
+
+broken_tests="\
+ //tensorflow/cc:framework_cc_ops_test + \
+ //tensorflow/core/platform/cloud:time_util_test + \
+ //tensorflow/core/platform/cloud:oauth_client_test + \
+ //tensorflow/core/platform/cloud:http_request_test + \
+ //tensorflow/core/platform/cloud:google_auth_provider_test + \
+ //tensorflow/core/platform/cloud:gcs_file_system_test + \
+ //tensorflow/core/kernels/cloud:bigquery_table_accessor_test + \
+ //tensorflow/core/kernels/hexagon:quantized_matmul_op_for_hexagon_test + \
+ //tensorflow/core/kernels:sparse_add_op_test + \
+ //tensorflow/core/kernels:spacetobatch_benchmark_test_gpu + \
+ //tensorflow/core/kernels:spacetobatch_benchmark_test + \
+ //tensorflow/core/kernels:requantize_op_test + \
+ //tensorflow/core/kernels:requantization_range_op_test + \
+ //tensorflow/core/kernels:quantized_reshape_op_test + \
+ //tensorflow/core/kernels:quantized_pooling_ops_test + \
+ //tensorflow/core/kernels:quantized_matmul_op_test + \
+ //tensorflow/core/kernels:quantized_conv_ops_test + \
+ //tensorflow/core/kernels:quantized_concat_op_test + \
+ //tensorflow/core/kernels:quantized_bias_add_op_test + \
+ //tensorflow/core/kernels:quantized_batch_norm_op_test + \
+ //tensorflow/core/kernels:quantized_activation_ops_test + \
+ //tensorflow/core/kernels:quantize_op_test + \
+ //tensorflow/core/kernels:quantize_down_and_shrink_range_op_test + \
+ //tensorflow/core/kernels:quantize_and_dequantize_op_test_gpu + \
+ //tensorflow/core/kernels:quantize_and_dequantize_op_test + \
+ //tensorflow/core/kernels:quantization_utils_test + \
+ //tensorflow/core/kernels:debug_ops_test + \
+ //tensorflow/core/kernels:control_flow_ops_test + \
+ //tensorflow/core/kernels:cast_op_test_gpu + \
+ //tensorflow/core/kernels:cast_op_test + \
+ //tensorflow/core/distributed_runtime/rpc:rpc_rendezvous_mgr_test_gpu + \
+ //tensorflow/core/distributed_runtime/rpc:rpc_rendezvous_mgr_test + \
+ //tensorflow/core/distributed_runtime/rpc:grpc_tensor_coding_test + \
+ //tensorflow/core/distributed_runtime/rpc:grpc_session_test_gpu + \
+ //tensorflow/core/distributed_runtime/rpc:grpc_session_test + \
+ //tensorflow/core/distributed_runtime/rpc:grpc_channel_test_gpu + \
+ //tensorflow/core/distributed_runtime/rpc:grpc_channel_test + \
+ //tensorflow/core/distributed_runtime:remote_device_test_gpu + \
+ //tensorflow/core/distributed_runtime:remote_device_test + \
+ //tensorflow/core/distributed_runtime:executor_test_gpu + \
+ //tensorflow/core/distributed_runtime:executor_test + \
+ //tensorflow/core/debug:debug_gateway_test + \
+ //tensorflow/core/debug:debug_grpc_io_utils_test + \
+ //tensorflow/core:util_reporter_test + \
+ //tensorflow/core:util_memmapped_file_system_test + \
+ //tensorflow/core:platform_subprocess_test + \
+ //tensorflow/core:platform_profile_utils_cpu_utils_test + \
+ //tensorflow/core:platform_port_test + \
+ //tensorflow/core:lib_strings_strcat_test + \
+ //tensorflow/core:lib_jpeg_jpeg_mem_unittest + \
+ //tensorflow/core:lib_core_notification_test + \
+ //tensorflow/core:framework_partial_tensor_shape_test + \
+ //tensorflow/core/debug:debug_io_utils_test \
+"
+
+exclude_tests="${failing_tests} + ${broken_tests}"
+
+BUILD_OPTS='-c opt --cpu=x64_windows_msvc --host_cpu=x64_windows_msvc --copt=/w --verbose_failures --experimental_ui'
+
+# Find all the passing cc_tests on Windows and store them in a variable
+passing_tests=$(bazel query "kind(cc_test, //tensorflow/cc/... + //tensorflow/core/...) - (${exclude_tests})" |
+ # We need to strip \r so that the result can be stored in a variable under MSYS
+ tr '\r' ' ')
+
+bazel test $BUILD_OPTS -k $passing_tests
+
diff --git a/tensorflow/tools/ci_build/windows/cpu/cmake/run_py.bat b/tensorflow/tools/ci_build/windows/cpu/cmake/run_py.bat
index 45ddfaee9c..11178a5d14 100644
--- a/tensorflow/tools/ci_build/windows/cpu/cmake/run_py.bat
+++ b/tensorflow/tools/ci_build/windows/cpu/cmake/run_py.bat
@@ -22,16 +22,25 @@ CD %BUILD_DIR%
SET BUILD_CC_TESTS=OFF
SET BUILD_PYTHON_TESTS=ON
+SET PIP_EXE="C:\Program Files\Anaconda3\Scripts\pip.exe"
+
:: Run the CMAKE build to build the pip package.
CALL %REPO_ROOT%\tensorflow\tools\ci_build\windows\cpu\cmake\run_build.bat
+if %errorlevel% neq 0 exit /b %errorlevel%
-SET PIP_EXE="C:\Program Files\Anaconda3\Scripts\pip.exe"
+:: Attempt to upgrade PIP to work around Anaconda issue #542.
+%PIP_EXE% install --ignore-installed --upgrade pip setuptools -v -v
-:: Uninstall tensorflow pip package, which might be a leftover from old runs.
-%PIP_EXE% uninstall -y tensorflow
+:: Since there are no wildcards in windows command prompt, use dark magic to get the wheel file name.
+DIR %REPO_ROOT%\%BUILD_DIR%\tf_python\dist\ /S /B > wheel_filename_file
+set /p WHEEL_FILENAME=<wheel_filename_file
+del wheel_filename_file
:: Install the pip package.
-%PIP_EXE% install --upgrade %REPO_ROOT%\%BUILD_DIR%\tf_python\dist\tensorflow-0.11.0rc2_cmake_experimental-py3-none-any.whl
+echo Installing PIP package...
+%PIP_EXE% install --upgrade %WHEEL_FILENAME% -v -v
+if %errorlevel% neq 0 exit /b %errorlevel%
-:: Run all python tests
+:: Run all python tests if the installation succeeded.
+echo Running tests...
ctest -C Release --output-on-failure
diff --git a/tensorflow/tools/ci_build/windows/cpu/pip/build_tf_windows.sh b/tensorflow/tools/ci_build/windows/cpu/pip/build_tf_windows.sh
index 706cb1628f..28c7475184 100644
--- a/tensorflow/tools/ci_build/windows/cpu/pip/build_tf_windows.sh
+++ b/tensorflow/tools/ci_build/windows/cpu/pip/build_tf_windows.sh
@@ -32,26 +32,11 @@ set -e
# This script is under <repo_root>/tensorflow/tools/ci_build/windows/cpu/pip/
# Change into repository root.
script_dir=$(dirname $0)
-cd ${script_dir%%tensorflow/tools/ci_build/windows/cpu/pip}
+cd ${script_dir%%tensorflow/tools/ci_build/windows/cpu/pip}.
-# Use a temporary directory with a short name.
-export TMPDIR="C:/tmp"
-
-# Set bash path
-export BAZEL_SH="C:/tools/msys64/usr/bin/bash"
-
-# Set Python path for ./configure
-export PYTHON_BIN_PATH="C:/Program Files/Anaconda3/python"
-
-# Set Python path for cc_configure.bzl
-export BAZEL_PYTHON="C:/Program Files/Anaconda3/python"
-
-# Set Visual Studio path
-export BAZEL_VS="C:/Program Files (x86)/Microsoft Visual Studio 14.0"
-
-# Add python into PATH, it's needed because gen_git_source.py uses
-# '/usr/bin/env python' as a shebang
-export PATH="/c/Program Files/Anaconda3:$PATH"
+# Setting up the environment variables Bazel and ./configure needs
+source "tensorflow/tools/ci_build/windows/cpu/bazel/common_env.sh" \
+ || { echo "Failed to source common_env.sh" >&2; exit 1; }
# bazel clean --expunge doesn't work on Windows yet.
# Clean the output base manually to ensure build correctness
@@ -63,11 +48,12 @@ bazel shutdown
sleep 5
rm -rf ${output_base}
+export TF_NEED_CUDA=0
echo "" | ./configure
-bazel build -c opt --cpu=x64_windows_msvc --host_cpu=x64_windows_msvc\
- --copt="/w" --verbose_failures --experimental_ui\
- tensorflow/tools/pip_package:build_pip_package || exit $?
+BUILD_OPTS='-c opt --cpu=x64_windows_msvc --host_cpu=x64_windows_msvc --copt=/w --verbose_failures --experimental_ui'
+bazel build $BUILD_OPTS tensorflow/tools/pip_package:build_pip_package || exit $?
./bazel-bin/tensorflow/tools/pip_package/build_pip_package $PWD
+
diff --git a/tensorflow/tools/ci_build/windows/gpu/cmake/run_build.bat b/tensorflow/tools/ci_build/windows/gpu/cmake/run_build.bat
new file mode 100644
index 0000000000..f124012edc
--- /dev/null
+++ b/tensorflow/tools/ci_build/windows/gpu/cmake/run_build.bat
@@ -0,0 +1,39 @@
+:: This script assumes the standard setup on tensorflow Jenkins windows machines.
+:: It is NOT guaranteed to work on any other machine. Use at your own risk!
+::
+:: REQUIREMENTS:
+:: * All installed in standard locations:
+:: - JDK8, and JAVA_HOME set.
+:: - Microsoft Visual Studio 2015 Community Edition
+:: - Msys2
+:: - Anaconda3
+:: - CMake
+:: * Before running this script, you have to set BUILD_CC_TESTS and BUILD_PYTHON_TESTS
+:: variables to either "ON" or "OFF".
+:: * Either have the REPO_ROOT variable set, or run this from the repository root directory.
+
+:: Check and set REPO_ROOT
+IF [%REPO_ROOT%] == [] (
+ SET REPO_ROOT=%cd%
+)
+
+:: Import all bunch of variables Visual Studio needs.
+CALL "C:\Program Files (x86)\Microsoft Visual Studio 14.0\VC\vcvarsall.bat"
+:: Turn echo back on, above script turns it off.
+ECHO ON
+
+:: Some common variables to be shared between runs.
+SET CMAKE_EXE="C:\Program Files\cmake\bin\cmake.exe"
+SET SWIG_EXE="C:\swigwin-3.0.10\swig.exe"
+SET PY_EXE="C:\Program Files\Anaconda3\python.exe"
+SET PY_LIB="C:\Program Files\Anaconda3\libs\python35.lib"
+SET CUDNN_HOME="c:\tools\cuda"
+
+SET CMAKE_DIR=%REPO_ROOT%\tensorflow\contrib\cmake
+SET MSBUILD_EXE="C:\Program Files (x86)\MSBuild\14.0\Bin\msbuild.exe"
+
+:: Run cmake to create Visual Studio Project files.
+%CMAKE_EXE% %CMAKE_DIR% -A x64 -DSWIG_EXECUTABLE=%SWIG_EXE% -DPYTHON_EXECUTABLE=%PY_EXE% -DCMAKE_BUILD_TYPE=Release -DPYTHON_LIBRARIES=%PY_LIB% -Dtensorflow_BUILD_PYTHON_TESTS=%BUILD_PYTHON_TESTS% -Dtensorflow_BUILD_CC_TESTS=%BUILD_CC_TESTS% -Dtensorflow_ENABLE_GPU=ON -DCUDNN_HOME=%CUDNN_HOME%
+
+:: Run msbuild in the resulting VS project files to build a pip package.
+%MSBUILD_EXE% /p:Configuration=Release /maxcpucount:32 tf_python_build_pip_package.vcxproj
diff --git a/tensorflow/tools/ci_build/windows/gpu/cmake/run_py.bat b/tensorflow/tools/ci_build/windows/gpu/cmake/run_py.bat
new file mode 100644
index 0000000000..9307ebb66b
--- /dev/null
+++ b/tensorflow/tools/ci_build/windows/gpu/cmake/run_py.bat
@@ -0,0 +1,46 @@
+:: This script assumes the standard setup on tensorflow Jenkins windows machines.
+:: It is NOT guaranteed to work on any other machine. Use at your own risk!
+::
+:: REQUIREMENTS:
+:: * All installed in standard locations:
+:: - JDK8, and JAVA_HOME set.
+:: - Microsoft Visual Studio 2015 Community Edition
+:: - Msys2
+:: - Anaconda3
+:: - CMake
+
+:: Record the directory we are in. Script should be invoked from the root of the repository.
+SET REPO_ROOT=%cd%
+
+:: Make sure we have a clean directory to build things in.
+SET BUILD_DIR=cmake_build
+RMDIR %BUILD_DIR% /S /Q
+MKDIR %BUILD_DIR%
+CD %BUILD_DIR%
+
+:: Set which tests to build
+SET BUILD_CC_TESTS=OFF
+SET BUILD_PYTHON_TESTS=ON
+
+SET PIP_EXE="C:\Program Files\Anaconda3\Scripts\pip.exe"
+
+:: Run the CMAKE build to build the pip package.
+CALL %REPO_ROOT%\tensorflow\tools\ci_build\windows\gpu\cmake\run_build.bat
+if %errorlevel% neq 0 exit /b %errorlevel%
+
+:: Attempt to upgrade PIP to work around Anaconda issue #542.
+%PIP_EXE% install --ignore-installed --upgrade pip setuptools -v -v
+
+:: Since there are no wildcards in windows command prompt, use dark magic to get the wheel file name.
+DIR %REPO_ROOT%\%BUILD_DIR%\tf_python\dist\ /S /B > wheel_filename_file
+set /p WHEEL_FILENAME=<wheel_filename_file
+del wheel_filename_file
+
+:: Install the pip package.
+echo Installing PIP package...
+%PIP_EXE% install --upgrade %WHEEL_FILENAME% -v -v
+if %errorlevel% neq 0 exit /b %errorlevel%
+
+:: Run all python tests if the installation succeeded.
+echo Running tests...
+ctest -C Release --output-on-failure --jobs 1
diff --git a/tensorflow/tools/docker/Dockerfile b/tensorflow/tools/docker/Dockerfile
index 82973226fc..a9852586e9 100644
--- a/tensorflow/tools/docker/Dockerfile
+++ b/tensorflow/tools/docker/Dockerfile
@@ -30,11 +30,10 @@ RUN pip --no-cache-dir install \
numpy \
scipy \
sklearn \
+ Pillow \
&& \
python -m ipykernel.kernelspec
-ENV TENSORFLOW_VERSION 0.11.0
-
# --- DO NOT EDIT OR DELETE BETWEEN THE LINES --- #
# These lines will be edited automatically by parameterized_docker_build.sh. #
# COPY _PIP_FILE_ /
@@ -43,9 +42,11 @@ ENV TENSORFLOW_VERSION 0.11.0
# Install TensorFlow CPU version from central repo
RUN pip --no-cache-dir install \
- http://storage.googleapis.com/tensorflow/linux/cpu/tensorflow-${TENSORFLOW_VERSION}-cp27-none-linux_x86_64.whl
+ http://storage.googleapis.com/tensorflow/linux/cpu/tensorflow-0.0.0-cp27-none-linux_x86_64.whl
# --- ~ DO NOT EDIT OR DELETE BETWEEN THE LINES --- #
+# RUN ln -s /usr/bin/python3 /usr/bin/python#
+
# Set up our notebook config.
COPY jupyter_notebook_config.py /root/.jupyter/
diff --git a/tensorflow/tools/docker/Dockerfile.devel b/tensorflow/tools/docker/Dockerfile.devel
index 2110dc34f9..fdc36aef3f 100644
--- a/tensorflow/tools/docker/Dockerfile.devel
+++ b/tensorflow/tools/docker/Dockerfile.devel
@@ -82,17 +82,19 @@ RUN mkdir /bazel && \
RUN git clone https://github.com/tensorflow/tensorflow.git && \
cd tensorflow && \
- git checkout r0.11
+ git checkout r0.12
WORKDIR /tensorflow
# TODO(craigcitro): Don't install the pip package, since it makes it
# more difficult to experiment with local changes. Instead, just add
# the built directory to the path.
+ENV CI_BUILD_PYTHON python
+
RUN tensorflow/tools/ci_build/builds/configured CPU \
bazel build -c opt tensorflow/tools/pip_package:build_pip_package && \
bazel-bin/tensorflow/tools/pip_package/build_pip_package /tmp/pip && \
- pip install --upgrade /tmp/pip/tensorflow-*.whl && \
+ pip --no-cache-dir install --upgrade /tmp/pip/tensorflow-*.whl && \
rm -rf /tmp/pip && \
rm -rf /root/.cache
# Clean up pip wheel and Bazel cache when done.
diff --git a/tensorflow/tools/docker/Dockerfile.devel-gpu b/tensorflow/tools/docker/Dockerfile.devel-gpu
index fe6da41a45..ded0539f89 100644
--- a/tensorflow/tools/docker/Dockerfile.devel-gpu
+++ b/tensorflow/tools/docker/Dockerfile.devel-gpu
@@ -11,7 +11,6 @@ RUN apt-get update && apt-get install -y --no-install-recommends \
libpng12-dev \
libzmq3-dev \
pkg-config \
- python \
python-dev \
rsync \
software-properties-common \
@@ -83,10 +82,11 @@ RUN mkdir /bazel && \
RUN git clone https://github.com/tensorflow/tensorflow.git && \
cd tensorflow && \
- git checkout r0.11
+ git checkout r0.12
WORKDIR /tensorflow
# Configure the build for our CUDA configuration.
+ENV CI_BUILD_PYTHON python
ENV LD_LIBRARY_PATH /usr/local/cuda/extras/CUPTI/lib64:$LD_LIBRARY_PATH
ENV TF_NEED_CUDA 1
ENV TF_CUDA_COMPUTE_CAPABILITIES=3.0,3.5,5.2
@@ -94,7 +94,7 @@ ENV TF_CUDA_COMPUTE_CAPABILITIES=3.0,3.5,5.2
RUN tensorflow/tools/ci_build/builds/configured GPU \
bazel build -c opt --config=cuda tensorflow/tools/pip_package:build_pip_package && \
bazel-bin/tensorflow/tools/pip_package/build_pip_package /tmp/pip && \
- pip install --upgrade /tmp/pip/tensorflow-*.whl && \
+ pip --no-cache-dir install --upgrade /tmp/pip/tensorflow-*.whl && \
rm -rf /tmp/pip && \
rm -rf /root/.cache
# Clean up pip wheel and Bazel cache when done.
diff --git a/tensorflow/tools/docker/Dockerfile.gpu b/tensorflow/tools/docker/Dockerfile.gpu
index 8c88c323e2..ca3252e1d9 100644
--- a/tensorflow/tools/docker/Dockerfile.gpu
+++ b/tensorflow/tools/docker/Dockerfile.gpu
@@ -30,11 +30,10 @@ RUN pip --no-cache-dir install \
numpy \
scipy \
sklearn \
+ Pillow \
&& \
python -m ipykernel.kernelspec
-ENV TENSORFLOW_VERSION 0.11.0
-
# --- DO NOT EDIT OR DELETE BETWEEN THE LINES --- #
# These lines will be edited automatically by parameterized_docker_build.sh. #
# COPY _PIP_FILE_ /
@@ -43,9 +42,11 @@ ENV TENSORFLOW_VERSION 0.11.0
# Install TensorFlow GPU version.
RUN pip --no-cache-dir install \
- http://storage.googleapis.com/tensorflow/linux/gpu/tensorflow_gpu-${TENSORFLOW_VERSION}-cp27-none-linux_x86_64.whl
+ http://storage.googleapis.com/tensorflow/linux/gpu/tensorflow_gpu-0.0.0-cp27-none-linux_x86_64.whl
# --- ~ DO NOT EDIT OR DELETE BETWEEN THE LINES --- #
+# RUN ln -s /usr/bin/python3 /usr/bin/python#
+
# Set up our notebook config.
COPY jupyter_notebook_config.py /root/.jupyter/
diff --git a/tensorflow/tools/docker/notebooks/3_mnist_from_scratch.ipynb b/tensorflow/tools/docker/notebooks/3_mnist_from_scratch.ipynb
index 703ddc4e79..797f0eb4f9 100644
--- a/tensorflow/tools/docker/notebooks/3_mnist_from_scratch.ipynb
+++ b/tensorflow/tools/docker/notebooks/3_mnist_from_scratch.ipynb
@@ -145,7 +145,7 @@
" if not os.path.exists(filepath):\n",
" filepath, _ = urlretrieve(SOURCE_URL + filename, filepath)\n",
" statinfo = os.stat(filepath)\n",
- " print('Succesfully downloaded', filename, statinfo.st_size, 'bytes.')\n",
+ " print('Successfully downloaded', filename, statinfo.st_size, 'bytes.')\n",
" else:\n",
" print('Already downloaded', filename)\n",
" return filepath\n",
diff --git a/tensorflow/tools/docker/parameterized_docker_build.sh b/tensorflow/tools/docker/parameterized_docker_build.sh
index 8d6099547c..b0c56c8965 100755
--- a/tensorflow/tools/docker/parameterized_docker_build.sh
+++ b/tensorflow/tools/docker/parameterized_docker_build.sh
@@ -31,9 +31,8 @@
#
# TF_DOCKER_BUILD_CENTRAL_PIP
# (Optional)
-# If set to any non-0 and non-empty value, will attempt to use the PIP file
-# located on the central repo, instead of locally built pip files.
-# This option takes effect only for non-devel builds.
+# If set to a non-empty string, will use it as the URL from which the
+# pip wheel file will be downloaded (instead of building the pip locally).
#
# TF_DOCKER_BUILD_IMAGE_NAME:
# (Optional)
@@ -81,7 +80,6 @@ mark_check_failed() {
TF_DOCKER_BUILD_TYPE=$(to_lower ${TF_DOCKER_BUILD_TYPE})
TF_DOCKER_BUILD_IS_DEVEL=$(to_lower ${TF_DOCKER_BUILD_IS_DEVEL})
-TF_DOCKER_BUILD_CENTRAL_PIP=$(to_lower ${TF_DOCKER_BUILD_CENTRAL_PIP})
TF_DOCKER_BUILD_PYTHON_VERSION=$(to_lower ${TF_DOCKER_BUILD_PYTHON_VERSION:-PYTHON2})
TF_DOCKER_BUILD_OPTIONS=$(to_lower ${TF_DOCKER_BUILD_OPTIONS:-OPT})
@@ -144,6 +142,15 @@ else
"${TF_DOCKER_BUILD_TYPE}"
fi
+if [[ "${TF_DOCKER_BUILD_PYTHON_VERSION}" == "python2" ]]; then
+ :
+elif [[ "${TF_DOCKER_BUILD_PYTHON_VERSION}" == "python3" ]]; then
+ FINAL_TAG="${FINAL_TAG}-py3"
+else
+ die "Unrecognized value in TF_DOCKER_BUILD_PYTHON_VERSION: "\
+"${TF_DOCKER_BUILD_PYTHON_VERSION}"
+fi
+
# Verify that the original Dockerfile exists
ORIG_DOCKERFILE="${SCRIPT_DIR}/${ORIG_DOCKERFILE}"
if [[ ! -f "${ORIG_DOCKERFILE}" ]]; then
@@ -156,21 +163,6 @@ echo "FINAL_TAG: ${FINAL_TAG}"
echo "Original Dockerfile: ${ORIG_DOCKERFILE}"
echo ""
-
-DO_PIP_BUILD=0
-if [[ ${TF_DOCKER_BUILD_IS_DEVEL} == "yes" ]]; then
- # Devel builds has pip build instructions in the Dockerfile
- :
-else
- if [[ ! -z ${TF_DOCKER_BUILD_CENTRAL_PIP} ]] &&
- [[ ${TF_DOCKER_BUILD_CENTRAL_PIP} != "0" ]]; then
- :
- else
- DO_PIP_BUILD=1
- fi
-fi
-
-
# Create tmp directory for Docker build
TMP_DIR=$(mktemp -d)
echo ""
@@ -179,67 +171,96 @@ echo "Docker build will occur in temporary directory: ${TMP_DIR}"
# Copy all files to tmp directory for Docker build
cp -r ${SCRIPT_DIR}/* "${TMP_DIR}/"
-
-if [[ "${DO_PIP_BUILD}" == "1" ]]; then
+if [[ "${TF_DOCKER_BUILD_IS_DEVEL}" == "no" ]]; then
DOCKERFILE="${TMP_DIR}/Dockerfile"
- # Perform local build of the required PIP whl file
- export TF_BUILD_CONTAINER_TYPE=${TF_DOCKER_BUILD_TYPE}
- export TF_BUILD_PYTHON_VERSION=${TF_DOCKER_BUILD_PYTHON_VERSION}
- export TF_BUILD_OPTIONS=${TF_DOCKER_BUILD_OPTIONS}
- export TF_BUILD_IS_PIP="PIP"
-
- if [[ "${TF_DOCKER_BUILD_TYPE}" == "gpu" ]]; then
- export TF_BUILD_APPEND_CI_DOCKER_EXTRA_PARAMS=\
-"${TF_BUILD_APPEND_CI_DOCKER_EXTRA_PARAMS} -e TF_CUDA_COMPUTE_CAPABILITIES=3.0,3.5,5.2"
- fi
-
- pushd "${SCRIPT_DIR}/../../../"
- rm -rf pip_test/whl &&
- tensorflow/tools/ci_build/ci_parameterized_build.sh
- PIP_BUILD_EXIT_CODE=$?
- popd
-
- # Was the pip build successful?
- if [[ ${PIP_BUILD_EXIT_CODE} != "0" ]]; then
- die "FAIL: Failed to build pip file locally"
- fi
-
- PIP_WHL=$(ls pip_test/whl/*.whl | head -1)
- if [[ -z "${PIP_WHL}" ]]; then
- die "ERROR: Cannot locate the locally-built pip whl file"
- fi
- echo "Locally-built PIP whl file is at: ${PIP_WHL}"
-
- # Copy the pip file to tmp directory
- cp "${PIP_WHL}" "${TMP_DIR}/" || \
- die "ERROR: Failed to copy wheel file: ${PIP_WHL}"
-
- # Use string replacement to put the correct file name into the Dockerfile
- PIP_WHL=$(basename "${PIP_WHL}")
-
- # Modify the non-devel Dockerfile to point to the correct pip whl file
- # location
- sed -e "/# --- DO NOT EDIT OR DELETE BETWEEN THE LINES --- #/,"\
+ if [[ -z "${TF_DOCKER_BUILD_CENTRAL_PIP}" ]]; then
+ # Perform local build of the required PIP whl file
+ export TF_BUILD_CONTAINER_TYPE=${TF_DOCKER_BUILD_TYPE}
+ export TF_BUILD_PYTHON_VERSION=${TF_DOCKER_BUILD_PYTHON_VERSION}
+ export TF_BUILD_OPTIONS=${TF_DOCKER_BUILD_OPTIONS}
+ export TF_BUILD_IS_PIP="PIP"
+
+ if [[ "${TF_DOCKER_BUILD_TYPE}" == "gpu" ]]; then
+ export TF_BUILD_APPEND_CI_DOCKER_EXTRA_PARAMS=\
+ "${TF_BUILD_APPEND_CI_DOCKER_EXTRA_PARAMS} -e TF_CUDA_COMPUTE_CAPABILITIES=3.0,3.5,5.2"
+ fi
+
+ pushd "${SCRIPT_DIR}/../../../"
+ rm -rf pip_test/whl &&
+ tensorflow/tools/ci_build/ci_parameterized_build.sh
+ PIP_BUILD_EXIT_CODE=$?
+ popd
+
+ # Was the pip build successful?
+ if [[ ${PIP_BUILD_EXIT_CODE} != "0" ]]; then
+ die "FAIL: Failed to build pip file locally"
+ fi
+
+ PIP_WHL=$(ls pip_test/whl/*.whl | head -1)
+ if [[ -z "${PIP_WHL}" ]]; then
+ die "ERROR: Cannot locate the locally-built pip whl file"
+ fi
+ echo "Locally-built PIP whl file is at: ${PIP_WHL}"
+
+ # Copy the pip file to tmp directory
+ cp "${PIP_WHL}" "${TMP_DIR}/" || \
+ die "ERROR: Failed to copy wheel file: ${PIP_WHL}"
+
+ # Use string replacement to put the correct file name into the Dockerfile
+ PIP_WHL=$(basename "${PIP_WHL}")
+
+ # Modify the non-devel Dockerfile to point to the correct pip whl file
+ # location
+ sed -e "/# --- DO NOT EDIT OR DELETE BETWEEN THE LINES --- #/,"\
"/# --- ~ DO NOT EDIT OR DELETE BETWEEN THE LINES --- #/c"\
"COPY ${PIP_WHL} /\n"\
"RUN pip --no-cache-dir install /${PIP_WHL}" "${ORIG_DOCKERFILE}" \
> "${DOCKERFILE}"
+ else
+ echo "Downloading pip wheel from: ${TF_DOCKER_BUILD_CENTRAL_PIP}"
+ echo
+
+ # Modify the non-devel Dockerfile to point to the correct pip whl URL.
+ sed -e "/# --- DO NOT EDIT OR DELETE BETWEEN THE LINES --- #/,"\
+"/# --- ~ DO NOT EDIT OR DELETE BETWEEN THE LINES --- #/c"\
+"RUN pip --no-cache-dir install ${TF_DOCKER_BUILD_CENTRAL_PIP}" "${ORIG_DOCKERFILE}" \
+ > "${DOCKERFILE}"
+ fi
echo "Modified Dockerfile at: ${DOCKERFILE}"
+ echo
+
+ # Modify python/pip version if necessary.
+ if [[ "${TF_DOCKER_BUILD_PYTHON_VERSION}" == "python3" ]]; then
+ sed -i -e 's/python /python3 /g' "${DOCKERFILE}" && \
+ sed -i -e 's/python-dev/python3-dev/g' "${DOCKERFILE}" && \
+ sed -i -e 's/pip /pip3 /g' "${DOCKERFILE}" && \
+ sed -i -e 's^# RUN ln -s /usr/bin/python3 /usr/bin/python#^RUN ln -s /usr/bin/python3 /usr/bin/python^' "${DOCKERFILE}" && \
+ echo "Modified Dockerfile for python version "\
+"${TF_DOCKER_BUILD_PYTHON_VERSION} at: ${DOCKERFILE}" || \
+ die "FAILED to modify ${DOCKERFILE} for python3"
+ fi
else
- if [[ "${TF_DOCKER_BUILD_IS_DEVEL}" == "yes" ]]; then
- DOCKERFILE="${TMP_DIR}/Dockerfile"
+ DOCKERFILE="${TMP_DIR}/Dockerfile"
- # Modify the devel Dockerfile to specify the git branch
- sed -r "s/([\s]*git checkout )(.*)/\1${TF_DOCKER_BUILD_DEVEL_BRANCH}/g" \
- "${ORIG_DOCKERFILE}" > "${DOCKERFILE}"
- else
- DOCKERFILE="${TMP_DIR}/"$(basename "${ORIG_DOCKERFILE}")
+ # Modify the devel Dockerfile to specify the git branch
+ sed -r "s/([\s]*git checkout )(.*)/\1${TF_DOCKER_BUILD_DEVEL_BRANCH}/g" \
+ "${ORIG_DOCKERFILE}" > "${DOCKERFILE}"
+
+ # Modify python/pip version if necessary.
+ if [[ "${TF_DOCKER_BUILD_PYTHON_VERSION}" == "python3" ]]; then
+ sed -i -e 's/python-dev/python-dev python3-dev/g' "${DOCKERFILE}" && \
+ sed -i -e 's/python /python3 /g' "${DOCKERFILE}" && \
+ sed -i -e 's^/tmp/pip^/tmp/pip3^g' "${DOCKERFILE}" && \
+ sed -i -e 's/pip /pip3 /g' "${DOCKERFILE}" && \
+ sed -i -e 's/ENV CI_BUILD_PYTHON python/ENV CI_BUILD_PYTHON python3/g' "${DOCKERFILE}" && \
+ sed -i -e 's^# RUN ln -s /usr/bin/python3 /usr/bin/python#^RUN ln -s /usr/bin/python3 /usr/bin/python^' "${DOCKERFILE}" && \
+ echo "Modified Dockerfile further for python version ${TF_DOCKER_BUILD_PYTHON_VERSION} at: ${DOCKERFILE}" || \
+ die "FAILED to modify ${DOCKERFILE} for python3"
fi
fi
-
# Perform docker build
# Intermediate image name with tag
IMG="${USER}/tensorflow:${FINAL_TAG}"
diff --git a/tensorflow/tools/gcs_test/Dockerfile b/tensorflow/tools/gcs_test/Dockerfile
index a9db7ce9b0..1df0069272 100644
--- a/tensorflow/tools/gcs_test/Dockerfile
+++ b/tensorflow/tools/gcs_test/Dockerfile
@@ -15,9 +15,9 @@ RUN curl -O https://dl.google.com/dl/cloudsdk/channels/rapid/install_google_clou
RUN chmod +x install_google_cloud_sdk.bash
RUN ./install_google_cloud_sdk.bash --disable-prompts --install-dir=/var/gcloud
-# Install nightly TensorFlow pip
-RUN pip install \
- https://ci.tensorflow.org/view/Nightly/job/nightly-matrix-cpu/TF_BUILD_IS_OPT=OPT,TF_BUILD_IS_PIP=PIP,TF_BUILD_PYTHON_VERSION=PYTHON2,label=cpu-slave/lastSuccessfulBuild/artifact/pip_test/whl/tensorflow-0.11.0-cp27-none-linux_x86_64.whl
+# Install TensorFlow pip from build context.
+COPY tensorflow-*.whl /
+RUN pip install /tensorflow-*.whl
# Copy test files
RUN mkdir -p /gcs-smoke/python
diff --git a/tensorflow/tools/gcs_test/gcs_smoke.sh b/tensorflow/tools/gcs_test/gcs_smoke.sh
index 6deff2e919..ec7ee4fbb0 100755
--- a/tensorflow/tools/gcs_test/gcs_smoke.sh
+++ b/tensorflow/tools/gcs_test/gcs_smoke.sh
@@ -17,9 +17,10 @@
# Driver script for TensorFlow-GCS smoke test.
#
# Usage:
-# gcs_smoke.sh <GCLOUD_JSON_KEY_PATH> <GCS_BUCKET_URL>
+# gcs_smoke.sh <WHL_URL> <GCLOUD_JSON_KEY_PATH> <GCS_BUCKET_URL>
#
# Input arguments:
+# WHL_URL: URL to the TensorFlow wheel file to use in this test.
# GCLOUD_KEY_JSON_PATH: Path to the Google Cloud JSON key file.
# See https://cloud.google.com/storage/docs/authentication for details.
#
@@ -34,13 +35,13 @@ print_usage() {
echo ""
}
-
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
source "${SCRIPT_DIR}/../ci_build/builds/builds_common.sh"
# Check input arguments
-GCLOUD_JSON_KEY_PATH=$1
-GCS_BUCKET_URL=$2
+WHL_URL=$1
+GCLOUD_JSON_KEY_PATH=$2
+GCS_BUCKET_URL=$3
if [[ -z "${GCLOUD_JSON_KEY_PATH}" ]]; then
print_usage
die "ERROR: Command-line argument GCLOUD_JSON_KEY_PATH is not supplied"
@@ -55,16 +56,36 @@ if [[ ! -f "${GCLOUD_JSON_KEY_PATH}" ]]; then
"${GCLOUD_JSON_KEY_PATH}\""
fi
-DOCKERFILE="${SCRIPT_DIR}/Dockerfile"
+# Create temporary directory for docker build
+BUILD_DIR=$(mktemp -d)
+echo ""
+echo "Using whl file URL: ${WHL_URL}"
+echo "Building in temporary directory: ${BUILD_DIR}"
+
+cp -r ${SCRIPT_DIR}/* "${BUILD_DIR}"/ || \
+ die "Failed to copy files to ${BUILD_DIR}"
+
+# Download whl file into the build context directory.
+wget -P "${BUILD_DIR}" ${WHL_URL} || \
+ die "Failed to download tensorflow whl file from URL: ${WHL_URL}"
+
+DOCKERFILE="${BUILD_DIR}/Dockerfile"
if [[ ! -f "${DOCKERFILE}" ]]; then
die "ERROR: Cannot find Dockerfile at expected path ${DOCKERFILE}"
fi
+# Download whl file into the build context directory.
+wget -P "${BUILD_DIR}" ${WHL_URL} || \
+ die "Failed to download tensorflow whl file from URL: ${WHL_URL}"
+
# Build the docker image for testing
docker build --no-cache \
- -f "${DOCKERFILE}" -t "${DOCKER_IMG}" "${SCRIPT_DIR}" || \
+ -f "${DOCKERFILE}" -t "${DOCKER_IMG}" "${BUILD_DIR}" || \
die "FAIL: Failed to build docker image for testing"
+# Clean up docker build context directory.
+rm -rf "${BUILD_DIR}"
+
# Run the docker image with the GCS key file mapped and the gcloud-required
# environment variables set.
docker run --rm \
diff --git a/tensorflow/tools/pip_package/setup.py b/tensorflow/tools/pip_package/setup.py
index 21f3060479..688681572e 100644
--- a/tensorflow/tools/pip_package/setup.py
+++ b/tensorflow/tools/pip_package/setup.py
@@ -26,7 +26,10 @@ from setuptools import find_packages, setup, Command
from setuptools.command.install import install as InstallCommandBase
from setuptools.dist import Distribution
-_VERSION = '0.11.0'
+# This version string is semver compatible, but incompatible with pip.
+# For pip, we will remove all '-' characters from this string, and use the
+# result for pip.
+_VERSION = '0.12.0-rc0'
REQUIRED_PACKAGES = [
'numpy >= 1.11.0',
@@ -161,7 +164,7 @@ headers = (list(find_files('*.h', 'tensorflow/core')) +
setup(
name=project_name,
- version=_VERSION,
+ version=_VERSION.replace('-', ''),
description='TensorFlow helps the tensors flow',
long_description='',
url='http://tensorflow.org/',
diff --git a/tensorflow/tools/tfprof/BUILD b/tensorflow/tools/tfprof/BUILD
index 56e1fb7ae4..57cccd8921 100644
--- a/tensorflow/tools/tfprof/BUILD
+++ b/tensorflow/tools/tfprof/BUILD
@@ -45,8 +45,8 @@ tf_proto_library(
["**/*.proto"],
),
cc_api_version = 2,
- cc_libs = ["//tensorflow/core:protos_all_cc"],
go_api_version = 2,
java_api_version = 2,
+ protodeps = ["//tensorflow/core:protos_all"],
visibility = ["//visibility:public"],
)
diff --git a/third_party/gpus/cuda/BUILD.tpl b/third_party/gpus/cuda/BUILD.tpl
index d3c307a6a0..20920f7fca 100644
--- a/third_party/gpus/cuda/BUILD.tpl
+++ b/third_party/gpus/cuda/BUILD.tpl
@@ -1,9 +1,5 @@
licenses(["restricted"]) # MPL2, portions GPL v3, LGPL v3, BSD-like
-load("@local_config_cuda//cuda:platform.bzl", "cuda_library_path")
-load("@local_config_cuda//cuda:platform.bzl", "cuda_static_library_path")
-load("@local_config_cuda//cuda:platform.bzl", "cudnn_library_path")
-load("@local_config_cuda//cuda:platform.bzl", "cupti_library_path")
load("@local_config_cuda//cuda:platform.bzl", "readlink_command")
package(default_visibility = ["//visibility:public"])
@@ -51,9 +47,7 @@ cc_library(
cc_library(
name = "cudart_static",
- srcs = [
- cuda_static_library_path("cudart"),
- ],
+ srcs = ["lib/%{cudart_static_lib}"],
includes = ["include/"],
linkopts = [
"-ldl",
@@ -65,12 +59,8 @@ cc_library(
cc_library(
name = "cudart",
- srcs = [
- cuda_library_path("cudart"),
- ],
- data = [
- cuda_library_path("cudart"),
- ],
+ srcs = ["lib/%{cudart_lib}"],
+ data = ["lib/%{cudart_lib}"],
includes = ["include/"],
linkstatic = 1,
visibility = ["//visibility:public"],
@@ -78,12 +68,8 @@ cc_library(
cc_library(
name = "cublas",
- srcs = [
- cuda_library_path("cublas"),
- ],
- data = [
- cuda_library_path("cublas"),
- ],
+ srcs = ["lib/%{cublas_lib}"],
+ data = ["lib/%{cublas_lib}"],
includes = ["include/"],
linkstatic = 1,
visibility = ["//visibility:public"],
@@ -91,12 +77,8 @@ cc_library(
cc_library(
name = "cudnn",
- srcs = [
- cudnn_library_path(),
- ],
- data = [
- cudnn_library_path(),
- ],
+ srcs = ["lib/%{cudnn_lib}"],
+ data = ["lib/%{cudnn_lib}"],
includes = ["include/"],
linkstatic = 1,
visibility = ["//visibility:public"],
@@ -104,12 +86,8 @@ cc_library(
cc_library(
name = "cufft",
- srcs = [
- cuda_library_path("cufft"),
- ],
- data = [
- cuda_library_path("cufft"),
- ],
+ srcs = ["lib/%{cufft_lib}"],
+ data = ["lib/%{cufft_lib}"],
includes = ["include/"],
linkstatic = 1,
visibility = ["//visibility:public"],
@@ -117,12 +95,8 @@ cc_library(
cc_library(
name = "curand",
- srcs = [
- cuda_library_path("curand"),
- ],
- data = [
- cuda_library_path("curand"),
- ],
+ srcs = ["lib/%{curand_lib}"],
+ data = ["lib/%{curand_lib}"],
includes = ["include/"],
linkstatic = 1,
visibility = ["//visibility:public"],
@@ -155,9 +129,7 @@ cc_library(
cc_library(
name = "cupti_dsos",
- data = [
- cupti_library_path(),
- ],
+ data = ["lib/%{cupti_lib}"],
visibility = ["//visibility:public"],
)
diff --git a/third_party/gpus/cuda/platform.bzl.tpl b/third_party/gpus/cuda/platform.bzl.tpl
index 539ed58d2c..01ef24b94e 100644
--- a/third_party/gpus/cuda/platform.bzl.tpl
+++ b/third_party/gpus/cuda/platform.bzl.tpl
@@ -8,65 +8,6 @@ def cuda_sdk_version():
def cudnn_sdk_version():
return CUDNN_VERSION
-def cuda_library_path(name, version = cuda_sdk_version()):
- if PLATFORM == "Darwin":
- if not version:
- return "lib/lib{}.dylib".format(name)
- else:
- return "lib/lib{}.{}.dylib".format(name, version)
- elif PLATFORM == "Windows":
- if not version:
- return "lib/{}.dll".format(name)
- else:
- return "lib/{}{}.dll".format(name, version)
- else:
- if not version:
- return "lib64/lib{}.so".format(name)
- else:
- return "lib64/lib{}.so.{}".format(name, version)
-
-def cuda_static_library_path(name):
- if PLATFORM == "Darwin":
- return "lib/lib{}_static.a".format(name)
- elif PLATFORM == "Windows":
- return "lib/{}_static.lib".format(name)
- else:
- return "lib64/lib{}_static.a".format(name)
-
-def cudnn_library_path(version = cudnn_sdk_version()):
- if PLATFORM == "Darwin":
- if not version:
- return "lib/libcudnn.dylib"
- else:
- return "lib/libcudnn.{}.dylib".format(version)
- elif PLATFORM == "Windows":
- if not version:
- return "lib/cudnn.dll"
- else:
- return "lib/cudnn{}.dll".format(version)
- else:
- if not version:
- return "lib64/libcudnn.so"
- else:
- return "lib64/libcudnn.so.{}".format(version)
-
-def cupti_library_path(version = cuda_sdk_version()):
- if PLATFORM == "Darwin":
- if not version:
- return "extras/CUPTI/lib/libcupti.dylib"
- else:
- return "extras/CUPTI/lib/libcupti.{}.dylib".format(version)
- elif PLATFORM == "Windows":
- if not version:
- return "extras/CUPTI/lib/cupti.dll"
- else:
- return "extras/CUPTI/lib/cupti{}.dll".format(version)
- else:
- if not version:
- return "extras/CUPTI/lib64/libcupti.so"
- else:
- return "extras/CUPTI/lib64/libcupti.so.{}".format(version)
-
def readlink_command():
if PLATFORM == "Darwin":
return "greadlink"
diff --git a/third_party/gpus/cuda_configure.bzl b/third_party/gpus/cuda_configure.bzl
index b6cead8685..06694d902c 100644
--- a/third_party/gpus/cuda_configure.bzl
+++ b/third_party/gpus/cuda_configure.bzl
@@ -120,12 +120,11 @@ def _enable_cuda(repository_ctx):
return False
-def _cuda_toolkit_path(repository_ctx, cuda_version):
+def _cuda_toolkit_path(repository_ctx):
"""Finds the cuda toolkit directory.
Args:
repository_ctx: The repository context.
- cuda_version: The cuda toolkit version.
Returns:
A speculative real path of the cuda toolkit install directory.
@@ -135,16 +134,6 @@ def _cuda_toolkit_path(repository_ctx, cuda_version):
cuda_toolkit_path = repository_ctx.os.environ[_CUDA_TOOLKIT_PATH].strip()
if not repository_ctx.path(cuda_toolkit_path).exists:
auto_configure_fail("Cannot find cuda toolkit path.")
-
- if cuda_version:
- # Handle typical configuration where the real path is
- # <basedir>/cuda-<version> and the provided path is <basedir>/cuda.
- version_suffixed = "%s-%s" % (cuda_toolkit_path, cuda_version)
- if repository_ctx.path(version_suffixed).exists:
- cuda_toolkit_path = version_suffixed
- # Returns the non-versioned path if cuda version is not provided or if the
- # installation does not use a cuda- directory, such as on ArchLinux where
- # CUDA installs directly to /opt/cuda.
return str(repository_ctx.path(cuda_toolkit_path).realpath)
@@ -158,20 +147,104 @@ def _cudnn_install_basedir(repository_ctx):
return cudnn_install_path
-def _cuda_version(repository_ctx):
- """Detects the cuda version."""
+_NVCC_VERSION_PREFIX = "Cuda compilation tools, release "
+
+
+def _cuda_version(repository_ctx, cuda_toolkit_path, cpu_value):
+ """Detects the version of CUDA installed on the system.
+
+ Args:
+ repository_ctx: The repository context.
+ cuda_toolkit_path: The CUDA install directory.
+
+ Returns:
+ String containing the version of CUDA.
+ """
+ # Run nvcc --version and find the line containing the CUDA version.
+ nvcc_path = repository_ctx.path("%s/bin/nvcc%s" %
+ (cuda_toolkit_path,
+ ".exe" if cpu_value == "Windows" else ""))
+ if not nvcc_path.exists:
+ auto_configure_fail("Cannot find nvcc at %s" % str(nvcc_path))
+ result = repository_ctx.execute([str(nvcc_path), '--version'])
+ if result.stderr:
+ auto_configure_fail("Error running nvcc --version: %s" % result.stderr)
+ lines = result.stdout.splitlines()
+ version_line = lines[len(lines) - 1]
+ if version_line.find(_NVCC_VERSION_PREFIX) == -1:
+ auto_configure_fail(
+ "Could not parse CUDA version from nvcc --version. Got: %s" %
+ result.stdout)
+
+ # Parse the CUDA version from the line containing the CUDA version.
+ prefix_removed = version_line.replace(_NVCC_VERSION_PREFIX, '')
+ parts = prefix_removed.split(",")
+ if len(parts) != 2 or len(parts[0]) == 0:
+ auto_configure_fail(
+ "Could not parse CUDA version from nvcc --version. Got: %s" %
+ result.stdout)
+ version = parts[0].strip()
+
+ # Check whether TF_CUDA_VERSION was set by the user and fail if it does not
+ # match the detected version.
+ environ_version = ""
if _TF_CUDA_VERSION in repository_ctx.os.environ:
- return repository_ctx.os.environ[_TF_CUDA_VERSION].strip()
- else:
- return ""
+ environ_version = repository_ctx.os.environ[_TF_CUDA_VERSION].strip()
+ if environ_version and version != environ_version:
+ auto_configure_fail(
+ "CUDA version detected from nvcc (%s) does not match " +
+ "TF_CUDA_VERSION (%s)" % (version, environ_version))
+
+ if cpu_value == "Windows":
+ version = "64_" + version.replace(".", "")
+ return version
+
+
+_DEFINE_CUDNN_MAJOR = "#define CUDNN_MAJOR"
+
+def _cudnn_version(repository_ctx, cudnn_install_basedir, cpu_value):
+ """Detects the version of cuDNN installed on the system.
-def _cudnn_version(repository_ctx):
- """Detects the cudnn version."""
+ Args:
+ repository_ctx: The repository context.
+ cpu_value: The name of the host operating system.
+ cudnn_install_basedir: The cuDNN install directory.
+
+ Returns:
+ A string containing the version of cuDNN.
+ """
+ # Find cudnn.h and grep for the line defining CUDNN_MAJOR.
+ cudnn_h_path = repository_ctx.path("%s/include/cudnn.h" %
+ cudnn_install_basedir)
+ if not cudnn_h_path.exists:
+ auto_configure_fail("Cannot find cudnn.h at %s" % str(cudnn_h_path))
+ result = repository_ctx.execute([
+ "grep", "-E", _DEFINE_CUDNN_MAJOR, str(cudnn_h_path)])
+ if result.stderr:
+ auto_configure_fail("Error reading %s: %s" %
+ (result.stderr, str(cudnn_h_path)))
+
+ # Parse the cuDNN major version from the line defining CUDNN_MAJOR
+ lines = result.stdout.splitlines()
+ if len(lines) == 0 or lines[0].find(_DEFINE_CUDNN_MAJOR) == -1:
+ auto_configure_fail("Cannot find line containing '%s' in %s" %
+ (_DEFINE_CUDNN_MAJOR, str(cudnn_h_path)))
+ version = lines[0].replace(_DEFINE_CUDNN_MAJOR, "").strip()
+
+ # Check whether TF_CUDNN_VERSION was set by the user and fail if it does not
+ # match the detected version.
+ environ_version = ""
if _TF_CUDNN_VERSION in repository_ctx.os.environ:
- return repository_ctx.os.environ[_TF_CUDNN_VERSION].strip()
- else:
- return ""
+ environ_version = repository_ctx.os.environ[_TF_CUDNN_VERSION].strip()
+ if environ_version and version != environ_version:
+ auto_configure_fail(
+ ("cuDNN version detected from %s (%s) does not match " +
+ "TF_CUDNN_VERSION (%s)") % (str(cudnn_h_path), version, environ_version))
+
+ if cpu_value == "Windows":
+ version = "64_" + version
+ return version
def _compute_capabilities(repository_ctx):
@@ -191,6 +264,14 @@ def _compute_capabilities(repository_ctx):
def _cpu_value(repository_ctx):
+ """Returns the name of the host operating system.
+
+ Args:
+ repository_ctx: The repository context.
+
+ Returns:
+ A string containing the name of the host operating system.
+ """
os_name = repository_ctx.os.name.lower()
if os_name.startswith("mac os"):
return "Darwin"
@@ -200,75 +281,163 @@ def _cpu_value(repository_ctx):
return result.stdout.strip()
-def _cuda_symlink_files(cpu_value, cuda_version, cudnn_version):
- """Returns a struct containing platform-specific paths.
+def _lib_name(lib, cpu_value, version="", static=False):
+ """Constructs the platform-specific name of a library.
Args:
- cpu_value: The string representing the host OS.
- cuda_version: The cuda version as returned by _cuda_version
- cudnn_version: The cudnn version as returned by _cudnn_version
+ lib: The name of the library, such as "cudart"
+ cpu_value: The name of the host operating system.
+ version: The version of the library.
+ static: True the library is static or False if it is a shared object.
+
+ Returns:
+ The platform-specific name of the library.
"""
- cuda_ext = ".%s" % cuda_version if cuda_version else ""
- cudnn_ext = ".%s" % cudnn_version if cudnn_version else ""
if cpu_value == "Linux":
- return struct(
- cuda_lib_path = "lib64",
- cuda_rt_lib = "lib64/libcudart.so%s" % cuda_ext,
- cuda_rt_lib_static = "lib64/libcudart_static.a",
- cuda_blas_lib = "lib64/libcublas.so%s" % cuda_ext,
- cuda_dnn_lib = "lib64/libcudnn.so%s" % cudnn_ext,
- cuda_dnn_lib_alt = "libcudnn.so%s" % cudnn_ext,
- cuda_rand_lib = "lib64/libcurand.so%s" % cuda_ext,
- cuda_fft_lib = "lib64/libcufft.so%s" % cuda_ext,
- cuda_cupti_lib = "extras/CUPTI/lib64/libcupti.so%s" % cuda_ext)
- elif cpu_value == "Darwin":
- return struct(
- cuda_lib_path = "lib",
- cuda_rt_lib = "lib/libcudart%s.dylib" % cuda_ext,
- cuda_rt_lib_static = "lib/libcudart_static.a",
- cuda_blas_lib = "lib/libcublas%s.dylib" % cuda_ext,
- cuda_dnn_lib = "lib/libcudnn%s.dylib" % cudnn_ext,
- cuda_dnn_lib_alt = "libcudnn%s.dylib" % cudnn_ext,
- cuda_rand_lib = "lib/libcurand%s.dylib" % cuda_ext,
- cuda_fft_lib = "lib/libcufft%s.dylib" % cuda_ext,
- cuda_cupti_lib = "extras/CUPTI/lib/libcupti%s.dylib" % cuda_ext)
+ if static:
+ return "lib%s.a" % lib
+ else:
+ if version:
+ version = ".%s" % version
+ return "lib%s.so%s" % (lib, version)
elif cpu_value == "Windows":
- return struct(
- cuda_lib_path = "lib",
- cuda_rt_lib = "lib/cudart%s.dll" % cuda_ext,
- cuda_rt_lib_static = "lib/cudart_static.lib",
- cuda_blas_lib = "lib/cublas%s.dll" % cuda_ext,
- cuda_dnn_lib = "lib/cudnn%s.dll" % cudnn_ext,
- cuda_dnn_lib_alt = "cudnn%s.dll" % cudnn_ext,
- cuda_rand_lib = "lib/curand%s.dll" % cuda_ext,
- cuda_fft_lib = "lib/cufft%s.dll" % cuda_ext,
- cuda_cupti_lib = "extras/CUPTI/lib/cupti%s.dll" % cuda_ext)
+ return "%s.lib" % lib
+ elif cpu_value == "Darwin":
+ if static:
+ return "lib%s.a" % lib
+ else:
+ if version:
+ version = ".%s" % version
+ return "lib%s%s.dylib" % (lib, version)
else:
- auto_configure_fail("Not supported CPU value %s" % cpu_value)
+ auto_configure_fail("Invalid cpu_value: %s" % cpu_value)
-def _check_lib(repository_ctx, cuda_toolkit_path, cuda_lib):
- """Checks if cuda_lib exists under cuda_toolkit_path or fail if it doesn't.
+def _find_cuda_lib(lib, repository_ctx, cpu_value, basedir, version="",
+ static=False):
+ """Finds the given CUDA or cuDNN library on the system.
Args:
+ lib: The name of the library, such as "cudart"
repository_ctx: The repository context.
- cuda_toolkit_path: The cuda toolkit directory containing the cuda libraries.
- cuda_lib: The library to look for under cuda_toolkit_path.
+ cpu_value: The name of the host operating system.
+ basedir: The install directory of CUDA or cuDNN.
+ version: The version of the library.
+ static: True if static library, False if shared object.
+
+ Returns:
+ Returns a struct with the following fields:
+ file_name: The basename of the library found on the system.
+ path: The full path to the library.
"""
- lib_path = cuda_toolkit_path + "/" + cuda_lib
- if not repository_ctx.path(lib_path).exists:
- auto_configure_fail("Cannot find %s" % lib_path)
+ file_name = _lib_name(lib, cpu_value, version, static)
+ if cpu_value == "Linux":
+ path = repository_ctx.path("%s/lib64/%s" % (basedir, file_name))
+ if path.exists:
+ return struct(file_name=file_name, path=str(path.realpath))
+ path = repository_ctx.path(
+ "%s/lib/x86_64-linux-gnu/%s" % (basedir, file_name))
+ if path.exists:
+ return struct(file_name=file_name, path=str(path.realpath))
+ elif cpu_value == "Windows":
+ path = repository_ctx.path("%s/lib/x64/%s" % (basedir, file_name))
+ if path.exists:
+ return struct(file_name=file_name, path=str(path.realpath))
+
+ path = repository_ctx.path("%s/lib/%s" % (basedir, file_name))
+ if path.exists:
+ return struct(file_name=file_name, path=str(path.realpath))
+ path = repository_ctx.path("%s/%s" % (basedir, file_name))
+ if path.exists:
+ return struct(file_name=file_name, path=str(path.realpath))
+
+ auto_configure_fail("Cannot find cuda library %s" % file_name)
+
+
+def _find_cupti_lib(repository_ctx, cuda_config):
+ """Finds the cupti library on the system.
-def _check_dir(repository_ctx, directory):
- """Checks whether the directory exists and fail if it does not.
+ On most systems, the cupti library is not installed in the same directory as
+ the other CUDA libraries but rather in a special extras/CUPTI directory.
Args:
repository_ctx: The repository context.
- directory: The directory to check the existence of.
+ cuda_config: The cuda configuration as returned by _get_cuda_config.
+
+ Returns:
+ Returns a struct with the following fields:
+ file_name: The basename of the library found on the system.
+ path: The full path to the library.
"""
- if not repository_ctx.path(directory).exists:
- auto_configure_fail("Cannot find dir: %s" % directory)
+ file_name = _lib_name("cupti", cuda_config.cpu_value,
+ cuda_config.cuda_version)
+ if cuda_config.cpu_value == "Linux":
+ path = repository_ctx.path(
+ "%s/extras/CUPTI/lib64/%s" % (cuda_config.cuda_toolkit_path, file_name))
+ if path.exists:
+ return struct(file_name=file_name, path=str(path.realpath))
+
+ path = repository_ctx.path(
+ "%s/lib/x86_64-linux-gnu/%s" % (cuda_config.cuda_toolkit_path,
+ file_name))
+ if path.exists:
+ return struct(file_name=file_name, path=str(path.realpath))
+
+ elif cuda_config.cpu_value == "Windows":
+ path = repository_ctx.path(
+ "%s/extras/CUPTI/libx64/%s" %
+ (cuda_config.cuda_toolkit_path, file_name))
+ if path.exists:
+ return struct(file_name=file_name, path=str(path.realpath))
+
+ path = repository_ctx.path(
+ "%s/extras/CUPTI/lib/%s" % (cuda_config.cuda_toolkit_path, file_name))
+ if path.exists:
+ return struct(file_name=file_name, path=str(path.realpath))
+
+ path = repository_ctx.path(
+ "%s/lib/%s" % (cuda_config.cuda_toolkit_path, file_name))
+ if path.exists:
+ return struct(file_name=file_name, path=str(path.realpath))
+
+ auto_configure_fail("Cannot find cupti library %s" % file_name)
+
+def _find_libs(repository_ctx, cuda_config):
+ """Returns the CUDA and cuDNN libraries on the system.
+
+ Args:
+ repository_ctx: The repository context.
+ cuda_config: The CUDA config as returned by _get_cuda_config
+
+ Returns:
+ Map of library names to structs of filename and path as returned by
+ _find_cuda_lib and _find_cupti_lib.
+ """
+ cudnn_version = cuda_config.cudnn_version
+ cudnn_ext = ".%s" % cudnn_version if cudnn_version else ""
+ cpu_value = cuda_config.cpu_value
+ return {
+ "cudart": _find_cuda_lib(
+ "cudart", repository_ctx, cpu_value, cuda_config.cuda_toolkit_path,
+ cuda_config.cuda_version),
+ "cudart_static": _find_cuda_lib(
+ "cudart_static", repository_ctx, cpu_value,
+ cuda_config.cuda_toolkit_path, cuda_config.cuda_version, static=True),
+ "cublas": _find_cuda_lib(
+ "cublas", repository_ctx, cpu_value, cuda_config.cuda_toolkit_path,
+ cuda_config.cuda_version),
+ "curand": _find_cuda_lib(
+ "curand", repository_ctx, cpu_value, cuda_config.cuda_toolkit_path,
+ cuda_config.cuda_version),
+ "cufft": _find_cuda_lib(
+ "cufft", repository_ctx, cpu_value, cuda_config.cuda_toolkit_path,
+ cuda_config.cuda_version),
+ "cudnn": _find_cuda_lib(
+ "cudnn", repository_ctx, cpu_value, cuda_config.cudnn_install_basedir,
+ cuda_config.cudnn_version),
+ "cupti": _find_cupti_lib(repository_ctx, cuda_config),
+ }
def _find_cudnn_header_dir(repository_ctx, cudnn_install_basedir):
@@ -319,6 +488,34 @@ def _cudart_static_linkopt(cpu_value):
"""Returns additional platform-specific linkopts for cudart."""
return "" if cpu_value == "Darwin" else "\"-lrt\","
+def _get_cuda_config(repository_ctx):
+ """Detects and returns information about the CUDA installation on the system.
+
+ Args:
+ repository_ctx: The repository context.
+
+ Returns:
+ A struct containing the following fields:
+ cuda_toolkit_path: The CUDA toolkit installation directory.
+ cudnn_install_basedir: The cuDNN installation directory.
+ cuda_version: The version of CUDA on the system.
+ cudnn_version: The version of cuDNN on the system.
+ compute_capabilities: A list of the system's CUDA compute capabilities.
+ cpu_value: The name of the host operating system.
+ """
+ cpu_value = _cpu_value(repository_ctx)
+ cuda_toolkit_path = _cuda_toolkit_path(repository_ctx)
+ cuda_version = _cuda_version(repository_ctx, cuda_toolkit_path, cpu_value)
+ cudnn_install_basedir = _cudnn_install_basedir(repository_ctx)
+ cudnn_version = _cudnn_version(repository_ctx, cudnn_install_basedir, cpu_value)
+ return struct(
+ cuda_toolkit_path = cuda_toolkit_path,
+ cudnn_install_basedir = cudnn_install_basedir,
+ cuda_version = cuda_version,
+ cudnn_version = cudnn_version,
+ compute_capabilities = _compute_capabilities(repository_ctx),
+ cpu_value = cpu_value)
+
def _tpl(repository_ctx, tpl, substitutions={}, out=None):
if not out:
@@ -365,8 +562,6 @@ error_gpu_disabled()
def _create_dummy_repository(repository_ctx):
cpu_value = _cpu_value(repository_ctx)
- symlink_files = _cuda_symlink_files(cpu_value, _DEFAULT_CUDA_VERSION,
- _DEFAULT_CUDNN_VERSION)
# Set up BUILD file for cuda/.
_tpl(repository_ctx, "cuda:build_defs.bzl",
@@ -377,6 +572,18 @@ def _create_dummy_repository(repository_ctx):
{
"%{cudart_static_linkopt}": _cudart_static_linkopt(cpu_value),
})
+ _tpl(repository_ctx, "cuda:BUILD",
+ {
+ "%{cudart_static_lib}": _lib_name("cudart_static", cpu_value,
+ static=True),
+ "%{cudart_static_linkopt}": _cudart_static_linkopt(cpu_value),
+ "%{cudart_lib}": _lib_name("cudart", cpu_value),
+ "%{cublas_lib}": _lib_name("cublas", cpu_value),
+ "%{cudnn_lib}": _lib_name("cudnn", cpu_value),
+ "%{cufft_lib}": _lib_name("cufft", cpu_value),
+ "%{curand_lib}": _lib_name("curand", cpu_value),
+ "%{cupti_lib}": _lib_name("cupti", cpu_value),
+ })
_tpl(repository_ctx, "cuda:platform.bzl",
{
"%{cuda_version}": _DEFAULT_CUDA_VERSION,
@@ -390,13 +597,13 @@ def _create_dummy_repository(repository_ctx):
repository_ctx.file("cuda/include/cublas.h", "")
repository_ctx.file("cuda/include/cudnn.h", "")
repository_ctx.file("cuda/extras/CUPTI/include/cupti.h", "")
- repository_ctx.file("cuda/%s" % symlink_files.cuda_rt_lib, "")
- repository_ctx.file("cuda/%s" % symlink_files.cuda_rt_lib_static, "")
- repository_ctx.file("cuda/%s" % symlink_files.cuda_blas_lib, "")
- repository_ctx.file("cuda/%s" % symlink_files.cuda_dnn_lib, "")
- repository_ctx.file("cuda/%s" % symlink_files.cuda_rand_lib, "")
- repository_ctx.file("cuda/%s" % symlink_files.cuda_fft_lib, "")
- repository_ctx.file("cuda/%s" % symlink_files.cuda_cupti_lib, "")
+ repository_ctx.file("cuda/lib/%s" % _lib_name("cudart", cpu_value))
+ repository_ctx.file("cuda/lib/%s" % _lib_name("cudart_static", cpu_value))
+ repository_ctx.file("cuda/lib/%s" % _lib_name("cublas", cpu_value))
+ repository_ctx.file("cuda/lib/%s" % _lib_name("cudnn", cpu_value))
+ repository_ctx.file("cuda/lib/%s" % _lib_name("curand", cpu_value))
+ repository_ctx.file("cuda/lib/%s" % _lib_name("cufft", cpu_value))
+ repository_ctx.file("cuda/lib/%s" % _lib_name("cupti", cpu_value))
# Set up cuda_config.h, which is used by
# tensorflow/stream_executor/dso_loader.cc.
@@ -431,44 +638,30 @@ def _symlink_dir(repository_ctx, src_dir, dest_dir):
def _create_cuda_repository(repository_ctx):
"""Creates the repository containing files set up to build with CUDA."""
- cuda_version = _cuda_version(repository_ctx)
- cuda_toolkit_path = _cuda_toolkit_path(repository_ctx, cuda_version)
- cudnn_install_basedir = _cudnn_install_basedir(repository_ctx)
- cudnn_version = _cudnn_version(repository_ctx)
- compute_capabilities = _compute_capabilities(repository_ctx)
-
- cpu_value = _cpu_value(repository_ctx)
- symlink_files = _cuda_symlink_files(cpu_value, cuda_version, cudnn_version)
- _check_lib(repository_ctx, cuda_toolkit_path, symlink_files.cuda_rt_lib)
- _check_lib(repository_ctx, cuda_toolkit_path, symlink_files.cuda_cupti_lib)
- _check_dir(repository_ctx, cudnn_install_basedir)
+ cuda_config = _get_cuda_config(repository_ctx)
cudnn_header_dir = _find_cudnn_header_dir(repository_ctx,
- cudnn_install_basedir)
- cudnn_lib_path = _find_cudnn_lib_path(repository_ctx, cudnn_install_basedir,
- symlink_files)
+ cuda_config.cudnn_install_basedir)
# Set up symbolic links for the cuda toolkit. We link at the individual file
# level not at the directory level. This is because the external library may
# have a different file layout from our desired structure.
+ cuda_toolkit_path = cuda_config.cuda_toolkit_path
_symlink_dir(repository_ctx, cuda_toolkit_path + "/include", "cuda/include")
- _symlink_dir(repository_ctx,
- cuda_toolkit_path + "/" + symlink_files.cuda_lib_path,
- "cuda/" + symlink_files.cuda_lib_path)
_symlink_dir(repository_ctx, cuda_toolkit_path + "/bin", "cuda/bin")
_symlink_dir(repository_ctx, cuda_toolkit_path + "/nvvm", "cuda/nvvm")
_symlink_dir(repository_ctx, cuda_toolkit_path + "/extras/CUPTI/include",
"cuda/extras/CUPTI/include")
- repository_ctx.symlink(cuda_toolkit_path + "/" + symlink_files.cuda_cupti_lib,
- "cuda/" + symlink_files.cuda_cupti_lib)
+
+ cuda_libs = _find_libs(repository_ctx, cuda_config)
+ for lib in cuda_libs.values():
+ repository_ctx.symlink(lib.path, "cuda/lib/" + lib.file_name)
# Set up the symbolic links for cudnn if cudnn was was not installed to
# CUDA_TOOLKIT_PATH.
if not repository_ctx.path("cuda/include/cudnn.h").exists:
repository_ctx.symlink(cudnn_header_dir + "/cudnn.h",
"cuda/include/cudnn.h")
- if not repository_ctx.path("cuda/" + symlink_files.cuda_dnn_lib).exists:
- repository_ctx.symlink(cudnn_lib_path, "cuda/" + symlink_files.cuda_dnn_lib)
# Set up BUILD file for cuda/
_tpl(repository_ctx, "cuda:build_defs.bzl",
@@ -477,13 +670,22 @@ def _create_cuda_repository(repository_ctx):
})
_tpl(repository_ctx, "cuda:BUILD",
{
- "%{cudart_static_linkopt}": _cudart_static_linkopt(cpu_value),
+ "%{cudart_static_lib}": cuda_libs["cudart_static"].file_name,
+ "%{cudart_static_linkopt}": _cudart_static_linkopt(
+ cuda_config.cpu_value),
+ "%{cudart_lib}": cuda_libs["cudart"].file_name,
+ "%{cublas_lib}": cuda_libs["cublas"].file_name,
+ "%{cudnn_lib}": cuda_libs["cudnn"].file_name,
+ "%{cufft_lib}": cuda_libs["cufft"].file_name,
+ "%{curand_lib}": cuda_libs["curand"].file_name,
+ "%{cupti_lib}": cuda_libs["cupti"].file_name,
})
+
_tpl(repository_ctx, "cuda:platform.bzl",
{
- "%{cuda_version}": cuda_version,
- "%{cudnn_version}": cudnn_version,
- "%{platform}": cpu_value,
+ "%{cuda_version}": cuda_config.cuda_version,
+ "%{cudnn_version}": cuda_config.cudnn_version,
+ "%{platform}": cuda_config.cpu_value,
})
# Set up crosstool/
@@ -492,7 +694,7 @@ def _create_cuda_repository(repository_ctx):
gcc_host_compiler_includes = _gcc_host_compiler_includes(repository_ctx, cc)
_tpl(repository_ctx, "crosstool:CROSSTOOL",
{
- "%{cuda_include_path}": cuda_toolkit_path + '/include',
+ "%{cuda_include_path}": cuda_config.cuda_toolkit_path + '/include',
"%{gcc_host_compiler_includes}": gcc_host_compiler_includes,
})
_tpl(repository_ctx,
@@ -501,17 +703,18 @@ def _create_cuda_repository(repository_ctx):
"%{cpu_compiler}": str(cc),
"%{gcc_host_compiler_path}": str(cc),
"%{cuda_compute_capabilities}": ", ".join(
- ["\"%s\"" % c for c in compute_capabilities]),
+ ["\"%s\"" % c for c in cuda_config.compute_capabilities]),
})
# Set up cuda_config.h, which is used by
# tensorflow/stream_executor/dso_loader.cc.
_tpl(repository_ctx, "cuda:cuda_config.h",
{
- "%{cuda_version}": cuda_version,
- "%{cudnn_version}": cudnn_version,
+ "%{cuda_version}": cuda_config.cuda_version,
+ "%{cudnn_version}": cuda_config.cudnn_version,
"%{cuda_compute_capabilities}": ",".join(
- ["CudaVersion(\"%s\")" % c for c in compute_capabilities]),
+ ["CudaVersion(\"%s\")" % c
+ for c in cuda_config.compute_capabilities]),
})
diff --git a/third_party/sycl/crosstool/CROSSTOOL.tpl b/third_party/sycl/crosstool/CROSSTOOL.tpl
index 2108a5b9f0..d767b8ca4a 100755
--- a/third_party/sycl/crosstool/CROSSTOOL.tpl
+++ b/third_party/sycl/crosstool/CROSSTOOL.tpl
@@ -72,6 +72,9 @@ toolchain {
# All warnings are enabled. Maybe enable -Werror as well?
compiler_flag: "-Wall"
+ # Enable SSE instructions by default
+ compiler_flag: "-msse3"
+
# Anticipated future default.
linker_flag: "-Wl,-no-as-needed"
# Stamp the binary with a unique identifier.
@@ -79,4 +82,22 @@ toolchain {
linker_flag: "-Wl,--hash-style=gnu"
linking_mode_flags { mode: DYNAMIC }
+
+ compilation_mode_flags {
+ mode: FASTBUILD
+ compiler_flag: "-O0"
+ }
+
+ compilation_mode_flags {
+ mode: DBG
+ compiler_flag: "-g"
+ }
+
+ compilation_mode_flags {
+ mode: OPT
+ compiler_flag: "-g0"
+ compiler_flag: "-O2"
+ compiler_flag: "-DNDEBUG"
+ }
}
+
diff --git a/third_party/sycl/crosstool/computecpp.tpl b/third_party/sycl/crosstool/computecpp.tpl
index d319a1eb75..0c7611d298 100755
--- a/third_party/sycl/crosstool/computecpp.tpl
+++ b/third_party/sycl/crosstool/computecpp.tpl
@@ -1,61 +1,57 @@
#!/usr/bin/env python
-from argparse import ArgumentParser
import os
import subprocess
-import re
import sys
-import pipes
CPU_CXX_COMPILER = ('%{host_cxx_compiler}')
CPU_C_COMPILER = ('%{host_c_compiler}')
CURRENT_DIR = os.path.dirname(sys.argv[0])
-COMPUTECPP_ROOT = CURRENT_DIR +"/../sycl/"
-COMPUTECPP_DRIVER= COMPUTECPP_ROOT+"bin/compute++"
-COMPUTECPP_INCLUDE = COMPUTECPP_ROOT+"include"
+COMPUTECPP_ROOT = CURRENT_DIR + '/../sycl/'
+COMPUTECPP_DRIVER= COMPUTECPP_ROOT + 'bin/compute++'
+COMPUTECPP_INCLUDE = COMPUTECPP_ROOT + 'include'
def main():
- computecpp_compiler_flags = [""]
computecpp_compiler_flags = [flag for flag in sys.argv[1:]]
- computecpp_compiler_flags = computecpp_compiler_flags + ["-D_GLIBCXX_USE_CXX11_ABI=0"]
- output_file_index = computecpp_compiler_flags.index("-o") +1
+ output_file_index = computecpp_compiler_flags.index('-o') + 1
output_file_name = computecpp_compiler_flags[output_file_index]
if(output_file_index == 1):
# we are linking
- return subprocess.call([CPU_CXX_COMPILER] +computecpp_compiler_flags )
+ return subprocess.call([CPU_CXX_COMPILER] + computecpp_compiler_flags)
+
+ computecpp_compiler_flags = computecpp_compiler_flags + ['-D_GLIBCXX_USE_CXX11_ABI=0']
# find what we compile
compiling_cpp = 0
- if("-c" in computecpp_compiler_flags):
- compiled_file_index = computecpp_compiler_flags.index("-c") +1
+ if('-c' in computecpp_compiler_flags):
+ compiled_file_index = computecpp_compiler_flags.index('-c') + 1
compited_file_name = computecpp_compiler_flags[compiled_file_index]
if(compited_file_name.endswith(('.cc', '.c++', '.cpp', '.CPP', '.C', '.cxx'))):
compiling_cpp = 1;
if(compiling_cpp == 1):
filename, file_extension = os.path.splitext(output_file_name)
- bc_out = filename + ".sycl"
+ bc_out = filename + '.sycl'
- computecpp_compiler_flags = ['-sycl-compress-name', '-DTENSORFLOW_USE_SYCL', '-Wno-unused-variable','-I', COMPUTECPP_INCLUDE,'-isystem',
- COMPUTECPP_INCLUDE, "-std=c++11", "-sycl", "-emit-llvm", "-no-serial-memop"] + computecpp_compiler_flags
+ computecpp_compiler_flags = ['--sycl-no-diags', '-sycl-compress-name', '-DTENSORFLOW_USE_SYCL', '-Wno-unused-variable', '-I', COMPUTECPP_INCLUDE, '-isystem',
+ COMPUTECPP_INCLUDE, '-std=c++11', '-sycl', '-emit-llvm', '-no-serial-memop'] + computecpp_compiler_flags
# dont want that in case of compiling with computecpp first
- host_compiler_flags = [""]
host_compiler_flags = [flag for flag in sys.argv[1:]
- if not flag.startswith(('-MF','-MD',))
- if not ".d" in flag]
+ if not flag.startswith(('-MF', '-MD',))
+ if not '.d' in flag]
- x = subprocess.call([COMPUTECPP_DRIVER] +computecpp_compiler_flags )
+ x = subprocess.call([COMPUTECPP_DRIVER] + computecpp_compiler_flags)
if(x == 0):
- host_compiler_flags = ['-D_GLIBCXX_USE_CXX11_ABI=0', '-DTENSORFLOW_USE_SYCL', '-Wno-unused-variable', '-I', COMPUTECPP_INCLUDE, "--include",bc_out] + host_compiler_flags
- return subprocess.call([CPU_CXX_COMPILER] +host_compiler_flags )
+ host_compiler_flags = ['-D_GLIBCXX_USE_CXX11_ABI=0', '-DTENSORFLOW_USE_SYCL', '-Wno-unused-variable', '-I', COMPUTECPP_INCLUDE, '--include', bc_out] + host_compiler_flags
+ x = subprocess.call([CPU_CXX_COMPILER] + host_compiler_flags)
return x
else:
# compile for C
- return subprocess.call([CPU_C_COMPILER] +computecpp_compiler_flags)
+ return subprocess.call([CPU_C_COMPILER] + computecpp_compiler_flags)
if __name__ == '__main__':
sys.exit(main())
diff --git a/tools/bazel.rc.template b/tools/bazel.rc.template
index 7a5c2aac80..8f99bf02fa 100644
--- a/tools/bazel.rc.template
+++ b/tools/bazel.rc.template
@@ -1,5 +1,6 @@
build:cuda --crosstool_top=@local_config_cuda//crosstool:toolchain
build:cuda --define=using_cuda=true --define=using_cuda_nvcc=true
+build:win-cuda --define=using_cuda=true --define=using_cuda_nvcc=true
build:sycl --crosstool_top=@local_config_sycl//crosstool:toolchain
build:sycl --define=using_sycl=true
diff --git a/zlib.BUILD b/zlib.BUILD
index 4c443dfe6e..16d61c0a0c 100644
--- a/zlib.BUILD
+++ b/zlib.BUILD
@@ -32,6 +32,6 @@ cc_library(
"zutil.h",
],
hdrs = ["zlib.h"],
- copts = ["-Wno-implicit-function-declaration"],
+ copts = ["-Wno-shift-negative-value", "-Wno-implicit-function-declaration"],
includes = ["."],
)