author    A. Unique TensorFlower <gardener@tensorflow.org> 2017-04-04 16:10:08 -0800
committer TensorFlower Gardener <gardener@tensorflow.org> 2017-04-04 17:24:57 -0700
commit    ccbc8991db3943ef984405881a1c917c530f902f
tree      a7b5c760155bfa4ff95ffc0ebd3823c649668997
parent    9477900946f923cb43ed76ed215490d01474bfe7
Merge changes from github.
Change: 152200430
-rw-r--r--  .gitignore | 1
-rw-r--r--  README.md | 17
-rw-r--r--  RELEASE.md | 109
-rw-r--r--  WORKSPACE | 8
-rwxr-xr-x  configure | 90
-rw-r--r--  tensorflow/compiler/aot/tests/make_test_graphs.py | 4
-rw-r--r--  tensorflow/compiler/tests/nary_ops_test.py | 14
-rw-r--r--  tensorflow/compiler/tests/pooling_ops_3d_test.py | 18
-rw-r--r--  tensorflow/contrib/android/cmake/build.gradle | 3
-rw-r--r--  tensorflow/contrib/cmake/CMakeLists.txt | 26
-rw-r--r--  tensorflow/contrib/cmake/README.md | 8
-rw-r--r--  tensorflow/contrib/cmake/external/jemalloc.cmake | 33
-rw-r--r--  tensorflow/contrib/cmake/tf_tools.cmake | 5
-rw-r--r--  tensorflow/contrib/ios_examples/camera/CameraExampleViewController.h | 1
-rw-r--r--  tensorflow/contrib/ios_examples/camera/CameraExampleViewController.mm | 25
-rw-r--r--  tensorflow/contrib/ios_examples/camera/camera_example.xcodeproj/project.pbxproj | 4
-rw-r--r--  tensorflow/contrib/ios_examples/camera/data/grace_hopper.jpg | bin 73746 -> 0 bytes
-rw-r--r--  tensorflow/contrib/ios_examples/camera/squarePNG.png | bin 9432 -> 0 bytes
-rw-r--r--  tensorflow/contrib/layers/python/layers/embedding_ops.py | 357
-rw-r--r--  tensorflow/contrib/layers/python/layers/embedding_ops_test.py | 291
-rw-r--r--  tensorflow/contrib/layers/python/layers/feature_column.py | 2
-rw-r--r--  tensorflow/contrib/learn/python/learn/README.md | 18
-rw-r--r--  tensorflow/contrib/learn/python/learn/dataframe/queues/feeding_functions.py | 1
-rw-r--r--  tensorflow/contrib/learn/python/learn/estimators/run_config.py | 8
-rw-r--r--  tensorflow/contrib/learn/python/learn/learn_io/__init__.py | 1
-rw-r--r--  tensorflow/contrib/learn/python/learn/learn_io/generator_io.py | 134
-rw-r--r--  tensorflow/contrib/learn/python/learn/learn_io/generator_io_test.py | 348
-rw-r--r--  tensorflow/contrib/makefile/Makefile | 1
-rw-r--r--  tensorflow/contrib/makefile/README.md | 2
-rw-r--r--  tensorflow/contrib/opt/python/training/external_optimizer.py | 16
-rw-r--r--  tensorflow/contrib/rnn/ops/lstm_ops.cc | 2
-rw-r--r--  tensorflow/contrib/seq2seq/python/ops/attention_wrapper.py | 12
-rw-r--r--  tensorflow/contrib/seq2seq/python/ops/loss.py | 39
-rw-r--r--  tensorflow/core/BUILD | 6
-rw-r--r--  tensorflow/core/graph/mkl_layout_pass.cc | 1086
-rw-r--r--  tensorflow/core/graph/mkl_layout_pass_test.cc | 440
-rw-r--r--  tensorflow/core/graph/mkl_tfconversion_pass.cc | 17
-rw-r--r--  tensorflow/core/graph/mkl_tfconversion_pass_test.cc | 36
-rw-r--r--  tensorflow/core/kernels/BUILD | 113
-rw-r--r--  tensorflow/core/kernels/conv_grad_filter_ops.cc | 115
-rw-r--r--  tensorflow/core/kernels/conv_grad_input_ops.cc | 88
-rw-r--r--  tensorflow/core/kernels/conv_ops.cc | 10
-rw-r--r--  tensorflow/core/kernels/conv_ops_gpu_3.cu.cc | 2
-rw-r--r--  tensorflow/core/kernels/cudnn_pooling_gpu.cc | 8
-rw-r--r--  tensorflow/core/kernels/maxpooling_op.cc | 491
-rw-r--r--  tensorflow/core/kernels/maxpooling_op_gpu.cu.cc | 264
-rw-r--r--  tensorflow/core/kernels/maxpooling_op_gpu.h | 92
-rw-r--r--  tensorflow/core/kernels/mkl_avgpooling_op.cc | 428
-rw-r--r--  tensorflow/core/kernels/mkl_conv_grad_bias_ops.cc | 264
-rw-r--r--  tensorflow/core/kernels/mkl_conv_grad_filter_ops.cc | 422
-rw-r--r--  tensorflow/core/kernels/mkl_conv_grad_input_ops.cc | 355
-rw-r--r--  tensorflow/core/kernels/mkl_conv_ops.cc | 391
-rw-r--r--  tensorflow/core/kernels/mkl_maxpooling_op.cc | 506
-rw-r--r--  tensorflow/core/kernels/mkl_pooling_ops_common.cc | 150
-rw-r--r--  tensorflow/core/kernels/mkl_pooling_ops_common.h | 92
-rw-r--r--  tensorflow/core/kernels/mkl_relu_op.cc | 397
-rw-r--r--  tensorflow/core/kernels/mkl_tfconv_op.cc | 22
-rw-r--r--  tensorflow/core/kernels/pooling_ops_3d.cc | 314
-rw-r--r--  tensorflow/core/kernels/pooling_ops_3d.h | 66
-rw-r--r--  tensorflow/core/kernels/pooling_ops_3d_gpu.cu.cc | 172
-rw-r--r--  tensorflow/core/kernels/pooling_ops_3d_gpu.h | 48
-rw-r--r--  tensorflow/core/kernels/pooling_ops_common.cc | 13
-rw-r--r--  tensorflow/core/kernels/random_op.cc | 5
-rw-r--r--  tensorflow/core/kernels/sparse_matmul_op.cc | 279
-rw-r--r--  tensorflow/core/kernels/sparse_matmul_op.h | 16
-rw-r--r--  tensorflow/core/kernels/xsmm_conv2d.cc | 482
-rw-r--r--  tensorflow/core/lib/io/inputbuffer.cc | 32
-rw-r--r--  tensorflow/core/ops/array_ops.cc | 5
-rw-r--r--  tensorflow/core/ops/nn_grad.cc | 31
-rw-r--r--  tensorflow/core/ops/nn_ops.cc | 338
-rw-r--r--  tensorflow/core/ops/ops.pbtxt | 53
-rw-r--r--  tensorflow/core/platform/cpu_info.cc | 30
-rw-r--r--  tensorflow/core/platform/windows/port.cc | 51
-rw-r--r--  tensorflow/core/util/mkl_util.h | 143
-rw-r--r--  tensorflow/docs_src/about/roadmap.md | 5
-rw-r--r--  tensorflow/docs_src/extend/adding_an_op.md | 11
-rw-r--r--  tensorflow/docs_src/get_started/get_started.md | 12
-rw-r--r--  tensorflow/docs_src/tutorials/wide.md | 2
-rw-r--r--  tensorflow/examples/android/README.md | 8
-rw-r--r--  tensorflow/examples/android/build.gradle | 2
-rw-r--r--  tensorflow/examples/tutorials/deepdream/deepdream.ipynb | 2
-rw-r--r--  tensorflow/examples/tutorials/monitors/iris_monitors.py | 30
-rw-r--r--  tensorflow/go/doc.go | 28
-rw-r--r--  tensorflow/go/example_inception_inference_test.go | 30
-rw-r--r--  tensorflow/go/genop/internal/genop.go | 40
-rw-r--r--  tensorflow/go/genop/internal/genop_test.go | 28
-rw-r--r--  tensorflow/go/genop/internal/lib.go | 27
-rw-r--r--  tensorflow/go/genop/main.go | 28
-rw-r--r--  tensorflow/go/graph.go | 28
-rw-r--r--  tensorflow/go/graph_test.go | 28
-rw-r--r--  tensorflow/go/lib.go | 28
-rw-r--r--  tensorflow/go/op/generate.go | 28
-rw-r--r--  tensorflow/go/op/op.go | 28
-rw-r--r--  tensorflow/go/op/op_test.go | 28
-rw-r--r--  tensorflow/go/op/scope.go | 28
-rw-r--r--  tensorflow/go/op/scope_test.go | 28
-rw-r--r--  tensorflow/go/operation.go | 28
-rw-r--r--  tensorflow/go/operation_test.go | 28
-rw-r--r--  tensorflow/go/saved_model.go | 28
-rw-r--r--  tensorflow/go/saved_model_test.go | 28
-rw-r--r--  tensorflow/go/session.cpp | 28
-rw-r--r--  tensorflow/go/session.go | 28
-rw-r--r--  tensorflow/go/session_test.go | 28
-rw-r--r--  tensorflow/go/shape.go | 28
-rw-r--r--  tensorflow/go/shape_test.go | 28
-rw-r--r--  tensorflow/go/status.go | 28
-rw-r--r--  tensorflow/go/tensor.go | 28
-rw-r--r--  tensorflow/go/tensor_test.go | 28
-rw-r--r--  tensorflow/go/util_test.go | 28
-rw-r--r--  tensorflow/go/version.go | 28
-rw-r--r--  tensorflow/java/maven/README.md | 44
-rw-r--r--  tensorflow/python/estimator/estimator.py | 11
-rw-r--r--  tensorflow/python/estimator/estimator_test.py | 32
-rw-r--r--  tensorflow/python/estimator/inputs/queues/feeding_functions.py | 64
-rw-r--r--  tensorflow/python/framework/tensor_util.py | 4
-rw-r--r--  tensorflow/python/framework/tensor_util_test.py | 94
-rw-r--r--  tensorflow/python/kernel_tests/pooling_ops_3d_test.py | 24
-rw-r--r--  tensorflow/python/kernel_tests/pooling_ops_test.py | 303
-rw-r--r--  tensorflow/python/layers/normalization.py | 2
-rw-r--r--  tensorflow/python/ops/hidden_ops.txt | 5
-rw-r--r--  tensorflow/python/ops/linalg_ops.py | 6
-rw-r--r--  tensorflow/python/ops/nn_grad.py | 102
-rw-r--r--  tensorflow/python/ops/nn_ops.py | 4
-rw-r--r--  tensorflow/python/platform/control_imports.py | 27
-rw-r--r--  tensorflow/python/platform/googletest.py | 2
-rw-r--r--  tensorflow/python/saved_model/builder_impl.py | 2
-rw-r--r--  tensorflow/stream_executor/cuda/cuda_dnn.cc | 73
-rw-r--r--  tensorflow/stream_executor/cuda/cuda_dnn.h | 16
-rw-r--r--  tensorflow/stream_executor/dnn.h | 38
-rw-r--r--  tensorflow/stream_executor/stream.cc | 51
-rw-r--r--  tensorflow/stream_executor/stream.h | 14
-rw-r--r--  tensorflow/tensorboard/components/vz_line_chart/vz-line-chart.ts | 34
-rw-r--r--  tensorflow/tensorboard/plugins/debugger/BUILD | 1
-rw-r--r--  tensorflow/tensorflow.bzl | 1009
-rw-r--r--  tensorflow/tools/ci_build/Dockerfile.android | 3
-rwxr-xr-x  tensorflow/tools/ci_build/builds/pip.sh | 2
-rw-r--r--  tensorflow/tools/ci_build/windows/bazel/common_env.sh | 2
-rw-r--r--  tensorflow/tools/ci_build/windows/cpu/bazel/common_env.sh | 50
-rw-r--r--  tensorflow/tools/dist_test/Dockerfile | 2
-rw-r--r--  tensorflow/tools/docker/Dockerfile.devel | 2
-rw-r--r--  tensorflow/tools/docker/Dockerfile.devel-gpu | 2
-rw-r--r--  tensorflow/tools/graph_transforms/README.md | 4
-rw-r--r--  tensorflow/tools/pip_package/setup.py | 7
-rw-r--r--  tensorflow/tools/test/check_futures_test.py | 1
-rw-r--r--  tensorflow/workspace.bzl | 1856
-rw-r--r--  third_party/gpus/cuda_configure.bzl | 8
-rw-r--r--  third_party/libxsmm.BUILD | 37
147 files changed, 10938 insertions, 3340 deletions
diff --git a/.gitignore b/.gitignore
index 01f06be1a9..900e5a53cb 100644
--- a/.gitignore
+++ b/.gitignore
@@ -2,6 +2,7 @@
.ipynb_checkpoints
node_modules
/.bazelrc
+/.tf_configure.bazelrc
/bazel-*
/third_party/py/numpy/numpy_include
/tools/bazel.rc
diff --git a/README.md b/README.md
index 84c42aad18..d9f05a67e0 100644
--- a/README.md
+++ b/README.md
@@ -1,6 +1,7 @@
<div align="center">
<img src="https://www.tensorflow.org/images/tf_logo_transp.png"><br><br>
</div>
+
-----------------
| **`Linux CPU`** | **`Linux GPU`** | **`Mac OS CPU`** | **`Windows CPU`** | **`Android`** |
@@ -33,12 +34,12 @@ and discussion.**
People who are a little more adventurous can also try our nightly binaries:
-* Linux CPU-only: [Python 2](https://ci.tensorflow.org/view/Nightly/job/nightly-matrix-cpu/TF_BUILD_IS_OPT=OPT,TF_BUILD_IS_PIP=PIP,TF_BUILD_PYTHON_VERSION=PYTHON2,label=cpu-slave/lastSuccessfulBuild/artifact/pip_test/whl/tensorflow-1.0.1-cp27-none-linux_x86_64.whl) ([build history](https://ci.tensorflow.org/view/Nightly/job/nightly-matrix-cpu/TF_BUILD_IS_OPT=OPT,TF_BUILD_IS_PIP=PIP,TF_BUILD_PYTHON_VERSION=PYTHON2,label=cpu-slave)) / [Python 3.4](https://ci.tensorflow.org/view/Nightly/job/nightly-matrix-cpu/TF_BUILD_IS_OPT=OPT,TF_BUILD_IS_PIP=PIP,TF_BUILD_PYTHON_VERSION=PYTHON3,label=cpu-slave/lastSuccessfulBuild/artifact/pip_test/whl/tensorflow-1.0.1-cp34-cp34m-linux_x86_64.whl) ([build history](https://ci.tensorflow.org/view/Nightly/job/nightly-matrix-cpu/TF_BUILD_IS_OPT=OPT,TF_BUILD_IS_PIP=PIP,TF_BUILD_PYTHON_VERSION=PYTHON3,label=cpu-slave/)) / [Python 3.5](https://ci.tensorflow.org/view/Nightly/job/nightly-python35-linux-cpu/lastSuccessfulBuild/artifact/pip_test/whl/tensorflow-1.0.1-cp35-cp35m-linux_x86_64.whl) ([build history](https://ci.tensorflow.org/view/Nightly/job/nightly-python35-linux-cpu/))
-* Linux GPU: [Python 2](https://ci.tensorflow.org/view/Nightly/job/nightly-matrix-linux-gpu/TF_BUILD_IS_OPT=OPT,TF_BUILD_IS_PIP=PIP,TF_BUILD_PYTHON_VERSION=PYTHON2,label=gpu-linux/lastSuccessfulBuild/artifact/pip_test/whl/tensorflow_gpu-1.0.1-cp27-none-linux_x86_64.whl) ([build history](https://ci.tensorflow.org/view/Nightly/job/nightly-matrix-linux-gpu/TF_BUILD_IS_OPT=OPT,TF_BUILD_IS_PIP=PIP,TF_BUILD_PYTHON_VERSION=PYTHON2,label=gpu-linux/)) / [Python 3.4](https://ci.tensorflow.org/view/Nightly/job/nightly-matrix-linux-gpu/TF_BUILD_IS_OPT=OPT,TF_BUILD_IS_PIP=PIP,TF_BUILD_PYTHON_VERSION=PYTHON3,label=gpu-linux/lastSuccessfulBuild/artifact/pip_test/whl/tensorflow_gpu-1.0.1-cp34-cp34m-linux_x86_64.whl) ([build history](https://ci.tensorflow.org/view/Nightly/job/nightly-matrix-linux-gpu/TF_BUILD_IS_OPT=OPT,TF_BUILD_IS_PIP=PIP,TF_BUILD_PYTHON_VERSION=PYTHON3,label=gpu-linux/)) / [Python 3.5](https://ci.tensorflow.org/view/Nightly/job/nightly-matrix-linux-gpu/TF_BUILD_IS_OPT=OPT,TF_BUILD_IS_PIP=PIP,TF_BUILD_PYTHON_VERSION=PYTHON3.5,label=gpu-linux/lastSuccessfulBuild/artifact/pip_test/whl/tensorflow_gpu-1.0.1-cp35-cp35m-linux_x86_64.whl) ([build history](https://ci.tensorflow.org/view/Nightly/job/nightly-matrix-linux-gpu/TF_BUILD_IS_OPT=OPT,TF_BUILD_IS_PIP=PIP,TF_BUILD_PYTHON_VERSION=PYTHON3.5,label=gpu-linux/))
-* Mac CPU-only: [Python 2](https://ci.tensorflow.org/view/Nightly/job/nightly-matrix-cpu/TF_BUILD_IS_OPT=OPT,TF_BUILD_IS_PIP=PIP,TF_BUILD_PYTHON_VERSION=PYTHON2,label=mac-slave/lastSuccessfulBuild/artifact/pip_test/whl/tensorflow-1.0.1-py2-none-any.whl) ([build history](https://ci.tensorflow.org/view/Nightly/job/nightly-matrix-cpu/TF_BUILD_IS_OPT=OPT,TF_BUILD_IS_PIP=PIP,TF_BUILD_PYTHON_VERSION=PYTHON2,label=mac-slave/)) / [Python 3](https://ci.tensorflow.org/view/Nightly/job/nightly-matrix-cpu/TF_BUILD_IS_OPT=OPT,TF_BUILD_IS_PIP=PIP,TF_BUILD_PYTHON_VERSION=PYTHON3,label=mac-slave/lastSuccessfulBuild/artifact/pip_test/whl/tensorflow-1.0.1-py3-none-any.whl) ([build history](https://ci.tensorflow.org/view/Nightly/job/nightly-matrix-cpu/TF_BUILD_IS_OPT=OPT,TF_BUILD_IS_PIP=PIP,TF_BUILD_PYTHON_VERSION=PYTHON3,label=mac-slave/))
-* Mac GPU: [Python 2](https://ci.tensorflow.org/view/Nightly/job/nightly-matrix-mac-gpu/TF_BUILD_IS_OPT=OPT,TF_BUILD_IS_PIP=PIP,TF_BUILD_PYTHON_VERSION=PYTHON2,label=gpu-mac/lastSuccessfulBuild/artifact/pip_test/whl/tensorflow_gpu-1.0.1-py2-none-any.whl) ([build history](https://ci.tensorflow.org/view/Nightly/job/nightly-matrix-mac-gpu/TF_BUILD_IS_OPT=OPT,TF_BUILD_IS_PIP=PIP,TF_BUILD_PYTHON_VERSION=PYTHON2,label=gpu-mac/)) / [Python 3](https://ci.tensorflow.org/view/Nightly/job/nightly-matrix-mac-gpu/TF_BUILD_IS_OPT=OPT,TF_BUILD_IS_PIP=PIP,TF_BUILD_PYTHON_VERSION=PYTHON3,label=gpu-mac/lastSuccessfulBuild/artifact/pip_test/whl/tensorflow_gpu-1.0.1-py3-none-any.whl) ([build history](https://ci.tensorflow.org/view/Nightly/job/nightly-matrix-mac-gpu/TF_BUILD_IS_OPT=OPT,TF_BUILD_IS_PIP=PIP,TF_BUILD_PYTHON_VERSION=PYTHON3,label=gpu-mac/))
-* Windows CPU-only: [Python 3.5 64-bit](https://ci.tensorflow.org/view/Nightly/job/nightly-win/DEVICE=cpu,OS=windows/lastSuccessfulBuild/artifact/cmake_build/tf_python/dist/tensorflow-1.0.1-cp35-cp35m-win_amd64.whl) ([build history](https://ci.tensorflow.org/view/Nightly/job/nightly-win/DEVICE=cpu,OS=windows/))
-* Windows GPU: [Python 3.5 64-bit](https://ci.tensorflow.org/view/Nightly/job/nightly-win/DEVICE=gpu,OS=windows/lastSuccessfulBuild/artifact/cmake_build/tf_python/dist/tensorflow_gpu-1.0.1-cp35-cp35m-win_amd64.whl) ([build history](https://ci.tensorflow.org/view/Nightly/job/nightly-win/DEVICE=gpu,OS=windows/))
+* Linux CPU-only: [Python 2](https://ci.tensorflow.org/view/Nightly/job/nightly-matrix-cpu/TF_BUILD_IS_OPT=OPT,TF_BUILD_IS_PIP=PIP,TF_BUILD_PYTHON_VERSION=PYTHON2,label=cpu-slave/lastSuccessfulBuild/artifact/pip_test/whl/tensorflow-1.1.0rc0-cp27-none-linux_x86_64.whl) ([build history](https://ci.tensorflow.org/view/Nightly/job/nightly-matrix-cpu/TF_BUILD_IS_OPT=OPT,TF_BUILD_IS_PIP=PIP,TF_BUILD_PYTHON_VERSION=PYTHON2,label=cpu-slave)) / [Python 3.4](https://ci.tensorflow.org/view/Nightly/job/nightly-matrix-cpu/TF_BUILD_IS_OPT=OPT,TF_BUILD_IS_PIP=PIP,TF_BUILD_PYTHON_VERSION=PYTHON3,label=cpu-slave/lastSuccessfulBuild/artifact/pip_test/whl/tensorflow-1.1.0rc0-cp34-cp34m-linux_x86_64.whl) ([build history](https://ci.tensorflow.org/view/Nightly/job/nightly-matrix-cpu/TF_BUILD_IS_OPT=OPT,TF_BUILD_IS_PIP=PIP,TF_BUILD_PYTHON_VERSION=PYTHON3,label=cpu-slave/)) / [Python 3.5](https://ci.tensorflow.org/view/Nightly/job/nightly-python35-linux-cpu/lastSuccessfulBuild/artifact/pip_test/whl/tensorflow-1.1.0rc0-cp35-cp35m-linux_x86_64.whl) ([build history](https://ci.tensorflow.org/view/Nightly/job/nightly-python35-linux-cpu/))
+* Linux GPU: [Python 2](https://ci.tensorflow.org/view/Nightly/job/nightly-matrix-linux-gpu/TF_BUILD_IS_OPT=OPT,TF_BUILD_IS_PIP=PIP,TF_BUILD_PYTHON_VERSION=PYTHON2,label=gpu-linux/lastSuccessfulBuild/artifact/pip_test/whl/tensorflow_gpu-1.1.0rc0-cp27-none-linux_x86_64.whl) ([build history](https://ci.tensorflow.org/view/Nightly/job/nightly-matrix-linux-gpu/TF_BUILD_IS_OPT=OPT,TF_BUILD_IS_PIP=PIP,TF_BUILD_PYTHON_VERSION=PYTHON2,label=gpu-linux/)) / [Python 3.4](https://ci.tensorflow.org/view/Nightly/job/nightly-matrix-linux-gpu/TF_BUILD_IS_OPT=OPT,TF_BUILD_IS_PIP=PIP,TF_BUILD_PYTHON_VERSION=PYTHON3,label=gpu-linux/lastSuccessfulBuild/artifact/pip_test/whl/tensorflow_gpu-1.1.0rc0-cp34-cp34m-linux_x86_64.whl) ([build history](https://ci.tensorflow.org/view/Nightly/job/nightly-matrix-linux-gpu/TF_BUILD_IS_OPT=OPT,TF_BUILD_IS_PIP=PIP,TF_BUILD_PYTHON_VERSION=PYTHON3,label=gpu-linux/)) / [Python 3.5](https://ci.tensorflow.org/view/Nightly/job/nightly-matrix-linux-gpu/TF_BUILD_IS_OPT=OPT,TF_BUILD_IS_PIP=PIP,TF_BUILD_PYTHON_VERSION=PYTHON3.5,label=gpu-linux/lastSuccessfulBuild/artifact/pip_test/whl/tensorflow_gpu-1.1.0rc0-cp35-cp35m-linux_x86_64.whl) ([build history](https://ci.tensorflow.org/view/Nightly/job/nightly-matrix-linux-gpu/TF_BUILD_IS_OPT=OPT,TF_BUILD_IS_PIP=PIP,TF_BUILD_PYTHON_VERSION=PYTHON3.5,label=gpu-linux/))
+* Mac CPU-only: [Python 2](https://ci.tensorflow.org/view/Nightly/job/nightly-matrix-cpu/TF_BUILD_IS_OPT=OPT,TF_BUILD_IS_PIP=PIP,TF_BUILD_PYTHON_VERSION=PYTHON2,label=mac-slave/lastSuccessfulBuild/artifact/pip_test/whl/tensorflow-1.1.0rc0-py2-none-any.whl) ([build history](https://ci.tensorflow.org/view/Nightly/job/nightly-matrix-cpu/TF_BUILD_IS_OPT=OPT,TF_BUILD_IS_PIP=PIP,TF_BUILD_PYTHON_VERSION=PYTHON2,label=mac-slave/)) / [Python 3](https://ci.tensorflow.org/view/Nightly/job/nightly-matrix-cpu/TF_BUILD_IS_OPT=OPT,TF_BUILD_IS_PIP=PIP,TF_BUILD_PYTHON_VERSION=PYTHON3,label=mac-slave/lastSuccessfulBuild/artifact/pip_test/whl/tensorflow-1.1.0rc0-py3-none-any.whl) ([build history](https://ci.tensorflow.org/view/Nightly/job/nightly-matrix-cpu/TF_BUILD_IS_OPT=OPT,TF_BUILD_IS_PIP=PIP,TF_BUILD_PYTHON_VERSION=PYTHON3,label=mac-slave/))
+* Mac GPU: [Python 2](https://ci.tensorflow.org/view/Nightly/job/nightly-matrix-mac-gpu/TF_BUILD_IS_OPT=OPT,TF_BUILD_IS_PIP=PIP,TF_BUILD_PYTHON_VERSION=PYTHON2,label=gpu-mac/lastSuccessfulBuild/artifact/pip_test/whl/tensorflow_gpu-1.1.0rc0-py2-none-any.whl) ([build history](https://ci.tensorflow.org/view/Nightly/job/nightly-matrix-mac-gpu/TF_BUILD_IS_OPT=OPT,TF_BUILD_IS_PIP=PIP,TF_BUILD_PYTHON_VERSION=PYTHON2,label=gpu-mac/)) / [Python 3](https://ci.tensorflow.org/view/Nightly/job/nightly-matrix-mac-gpu/TF_BUILD_IS_OPT=OPT,TF_BUILD_IS_PIP=PIP,TF_BUILD_PYTHON_VERSION=PYTHON3,label=gpu-mac/lastSuccessfulBuild/artifact/pip_test/whl/tensorflow_gpu-1.1.0rc0-py3-none-any.whl) ([build history](https://ci.tensorflow.org/view/Nightly/job/nightly-matrix-mac-gpu/TF_BUILD_IS_OPT=OPT,TF_BUILD_IS_PIP=PIP,TF_BUILD_PYTHON_VERSION=PYTHON3,label=gpu-mac/))
+* Windows CPU-only: [Python 3.5 64-bit](https://ci.tensorflow.org/view/Nightly/job/nightly-win/DEVICE=cpu,OS=windows/lastSuccessfulBuild/artifact/cmake_build/tf_python/dist/tensorflow-1.1.0rc0-cp35-cp35m-win_amd64.whl) ([build history](https://ci.tensorflow.org/view/Nightly/job/nightly-win/DEVICE=cpu,OS=windows/))
+* Windows GPU: [Python 3.5 64-bit](https://ci.tensorflow.org/view/Nightly/job/nightly-win/DEVICE=gpu,OS=windows/lastSuccessfulBuild/artifact/cmake_build/tf_python/dist/tensorflow_gpu-1.1.0rc0-cp35-cp35m-win_amd64.whl) ([build history](https://ci.tensorflow.org/view/Nightly/job/nightly-win/DEVICE=gpu,OS=windows/))
* Android: [demo APK](https://ci.tensorflow.org/view/Nightly/job/nightly-android/lastSuccessfulBuild/artifact/out/tensorflow_demo.apk), [native libs](http://ci.tensorflow.org/view/Nightly/job/nightly-android/lastSuccessfulBuild/artifact/out/native/)
([build history](https://ci.tensorflow.org/view/Nightly/job/nightly-android/))
@@ -59,11 +60,11 @@ Hello, TensorFlow!
>>>
```
-##For more information
+## For more information
* [TensorFlow website](http://tensorflow.org)
* [TensorFlow whitepaper](http://download.tensorflow.org/paper/whitepaper2015.pdf)
* [TensorFlow Model Zoo](https://github.com/tensorflow/models)
* [TensorFlow MOOC on Udacity](https://www.udacity.com/course/deep-learning--ud730)
-The TensorFlow community has created amazing things with TensorFlow, please see the [resources section of tensorflow.org](https://www.tensorflow.org/versions/master/resources#community) for an incomplete list.
+The TensorFlow community has created amazing things with TensorFlow, please see the [resources section of tensorflow.org](https://www.tensorflow.org/about/#community) for an incomplete list.
diff --git a/RELEASE.md b/RELEASE.md
index 5f261a4543..156cc2e3af 100644
--- a/RELEASE.md
+++ b/RELEASE.md
@@ -1,3 +1,110 @@
+# Release 1.1.0
+
+## Major Features and Improvements
+* Added Java API support for Windows.
+* Added `tf.spectral` module. Moved existing FFT ops to `tf.spectral` while
+ keeping an alias in the old location (`tf.*`).
+* Added 1D, 2D and 3D Fourier transform ops for real signals to `tf.spectral`.
+* Added a `tf.bincount` function.
+* Added Keras 2 API to contrib.
+* Added a new lightweight queue-like object - `RecordInput`.
+* Added `tf.contrib.image.compose_transforms` function.
+* Bring `tf.estimator.*` into the API. Non-deprecated functionality from `tf.contrib.learn.Estimator` is moved to `tf.estimator.Estimator` with cosmetic changes.
+* Docker images: TF images on gcr.io and Docker Hub are upgraded to ubuntu:16.04.
+* Added the following features to TensorFlow Debugger (tfdbg):
+ * Ability to inspect Python source file against TF ops and tensors (command `print_source` / `ps`)
+ * New navigation bar in Curses-based UI
+ * NodeStepper (command `invoke_stepper`) now uses intermediate tensor dumps. It also uses `TensorHandles` as direct feeds during successive `cont` calls for improved performance and reduced memory consumption.
+
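As a quick illustration of two of the additions above, a minimal sketch against the TF 1.1-era API (values and shapes are arbitrary):

```
# Illustrative use of tf.bincount and the new tf.spectral module; the old
# tf.fft alias is kept per the note above.
import numpy as np
import tensorflow as tf

counts = tf.bincount(tf.constant([1, 1, 2, 5]))      # -> [0, 2, 1, 0, 0, 1]
signal = tf.constant(np.arange(4).astype(np.complex64))
spectrum = tf.spectral.fft(signal)                   # same op as tf.fft

with tf.Session() as sess:
    print(sess.run([counts, spectrum]))
```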
+## Deprecations
+
+* TensorFlow 1.1.0 will be the last time we release a binary with Mac GPU support. Going forward, we will stop testing on Mac GPU systems. We continue to welcome patches that maintain Mac GPU support, and we will try to keep the Mac GPU build working.
+
+## Changes to contrib APIs
+* The behavior of RNNCells is now stricter due to the transition towards making RNNCells act more like Keras layers.
+ * If an RNNCell is used twice in two different variable scopes, an error is raised describing how to avoid this behavior.
+ * If an RNNCell is used in a variable scope with existing conflicting variables, an error is raised showing that the RNNCell must be constructed with argument `reuse=True`.
+* Deprecated contrib/distributions `pmf`, `pdf`, `log_pmf`, `log_pdf`.
+* Moved `bayesflow.special_math` to distributions.
+* `tf.contrib.tensor_forest.python.tensor_forest.RandomForestDeviceAssigner` removed.
+* Changed some MVN classes and parameters:
+ * `tf.contrib.distributions.MultivariateNormalFull` replaced by `tf.contrib.distributions.MultivariateNormalTriL`.
+ * `tf.contrib.distributions.MultivariateNormalCholesky` replaced by `tf.contrib.distributions.MultivariateNormalTriL`
+ * `tf.contrib.distributions.MultivariateNormalDiagWithSoftplusStDev` replaced
+ by `tf.contrib.distributions.MultivariateNormalDiagWithSoftplusScale`
+ * `tf.contrib.distributions.MultivariateNormalDiag` arguments changed from `mu`, `diag_stddev` to `loc`, `scale_diag`.
+ * `tf.contrib.distributions.MultivariateNormalDiagPlusVDVT` removed.
+ * `tf.contrib.distributions.MultivariateNormalDiagPlusLowRank` added.
+
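A minimal sketch of the stricter RNNCell reuse behavior described above (cell classes live in `tf.contrib.rnn` in this release; sizes are arbitrary):

```
import tensorflow as tf

inputs = tf.placeholder(tf.float32, [8, 16])
cell = tf.contrib.rnn.BasicLSTMCell(32)
state = cell.zero_state(8, tf.float32)
with tf.variable_scope("rnn"):
    out1, state1 = cell(inputs, state)         # first use creates the variables
# Re-entering the scope now requires a cell constructed with reuse=True;
# otherwise an error describing the conflict is raised, as noted above.
with tf.variable_scope("rnn", reuse=True):
    reuse_cell = tf.contrib.rnn.BasicLSTMCell(32, reuse=True)
    out2, state2 = reuse_cell(inputs, state1)  # shares the variables above
```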
+## Bug Fixes and Other Changes
+* Java: Support for loading models exported using the SavedModel API (courtesy @EronWright).
+* Go: Added support for incremental graph execution.
+* Fix a bug in the WALS solver when single-threaded.
+* Added support for integer sparse feature values in `tf.contrib.layers.sparse_column_with_keys`.
+* Fixed `tf.set_random_seed(0)` to be deterministic for all ops.
+* Stability improvements for the GCS file system support.
+* Improved TensorForest performance.
+* Added support for multiple filename globs in `tf.matching_files`.
+* `LogMessage` now includes a timestamp as beginning of a message.
+* Added MultiBox person detector example standalone binary.
+* Android demo: Makefile build functionality added to build.gradle to fully support building TensorFlow demo in Android on Windows.
+* Android demo: read MultiBox priors from txt file rather than protobuf.
+* Added colocation constraints to `StagingArea`.
+* `sparse_matmul_op` reenabled for Android builds.
+* Restrict weights rank to be the same as the broadcast target, to avoid ambiguity on broadcast rules.
+* Upgraded libxsmm to 1.7.1 and applied other changes for performance and memory usage.
+* Fixed bfloat16 integration of LIBXSMM sparse mat-mul.
+* Improved performance and reduce memory usage by allowing ops to forward input buffers to output buffers and perform computations in-place.
+* Improved the performance of CPU assignment for strings.
+* Speed up matrix * vector multiplication and matrix * matrix with unknown shapes.
+* C API: Graph imports now support input remapping, control dependencies, and returning imported nodes (see `TF_GraphImportGraphDefWithReturnOutputs()`)
+* Multiple C++ API updates.
+* Multiple TensorBoard updates including:
+ * Users can now view image summaries at various sampled steps (instead of just the last step).
+ * Bugs involving switching runs as well as the image dashboard are fixed.
+ * Removed data download links from TensorBoard.
+ * TensorBoard uses a relative data directory, for easier embedding.
+ * TensorBoard automatically ignores outliers for domain calculation, and formats proportional values consistently.
+* Multiple tfdbg bug fixes:
+ * Fixed Windows compatibility issues.
+ * Command history now persists across runs.
+
+## Thanks to our Contributors
+
+This release contains contributions from many people at Google, as well as:
+
+A. Besir Kurtulmus, Adal Chiriliuc, @akash, Alec-Desouza, Alex Rothberg, Alex
+Sergeev, Alexander Heinecke, Allen Guo, Andreas Madsen, Ankesh Anand, Anton
+Loss, @Aravind, @Arie, Ashutosh Das, AuréLien Geron, Bairen Yi, @bakunyo, Ben
+Visser, Brady Zhou, Calpa Liu, Changming Sun, Chi Zeng, Chih Cheng Liang,
+Christopher Berner, Clark Zinzow, @Conchylicultor, Courtial Florian, Dan Ellis,
+Dan J, Dan Jarvis, Daniel Ylitalo, Darren Garvey, David Norman, David Truong,
+@DavidNorman, Dimitar Pavlov, Dmitry Persiyanov, @Eddie, @elirex, Erfan
+Noury, Eron Wright, Evgeny Mazovetskiy, Fabrizio (Misto) Milo, @fanlu, Fisher
+Coder, Franck Dernoncourt, Gagan Goel, Gao, Xiang, @Gautam, Gefu Tang,
+@guilherme, @guschmue, Hannah Provenza, Hans Pabst, @hartb, Hsiao Yi, Huazuo
+Gao, Igor ChorążEwicz, Ivan Smirnov, Jakub Kolodziejczyk, Jason Gavris, Jason
+Morton, Jay Young, Jayaram Bobba, Jeremy Sawruk, Jiaming Liu, Jihun Choi,
+@jiqiu, Joan Thibault, John C F, Jojy G Varghese, Jon Malmaud, Julian Berman,
+Julian Niedermeier, Junpeng Lao, Kai Sasaki, @Kankroc, Karl Lessard, Kyle
+Bostelmann, @Lezcano, Li Yi, Luo Yun, @lurker, Mahmoud-Abuzaina, Mandeep Singh,
+Marek Kolodziej, Mark Szepieniec, Martial Hue, Medhat Omr, Memo Akten, Michael
+Gharbi, MichaëL Defferrard, Milan Straka, @MircoT, @mlucool, Muammar Ibn Faisal,
+Nayana Thorat, @nghiattran, Nicholas Connor, Nikolaas Steenbergen, Niraj Patel,
+Niranjan Hasabnis, @Panmari, Pavel Bulanov, Philip Pries Henningsen, Philipp
+Jund, @polonez, Prayag Verma, Rahul Kavi, Raphael Gontijo Lopes, @rasbt, Raven
+Iqqe, Reid Pryzant, Richard Shin, Rizwan Asif, Russell Kaplan, Ryo Asakura,
+RüDiger Busche, Saisai Shao, Sam Abrahams, @sanosay, Sean Papay, @seaotterman,
+@selay01, Shaurya Sharma, Sriram Narayanamoorthy, Stefano Probst, @taknevski,
+@tbonza, @teldridge11, Yuan (Terry) Tang, Tim Anglade, Tomas Reimers, Tomer Gafner,
+Valentin Iovene, Vamsi Sripathi, Viktor Malyi, Vit Stepanovs, Vivek Rane, Vlad
+Firoiu, @wangg12, @will, Xiaoyu Tao, Yaroslav Bulatov, Yuan (Terry) Tang,
+@Yufeng, Yuming Wang, Yuxin Wu, Zafar Takhirov, Ziming Dong
+
+We are also grateful to all who filed issues or helped resolve them, asked and
+answered questions, and were part of inspiring discussions.
+
+
# Release 1.0.1
## Bug Fixes and Other Changes
@@ -94,7 +201,7 @@ To help you upgrade your existing TensorFlow Python code to match the API change
* In the C++ API (in tensorflow/cc), Input, Output, etc. have moved
from the tensorflow::ops namespace to tensorflow.
* Change arg order for `{softmax,sparse_softmax,sigmoid}_cross_entropy_with_logits` to be (labels, predictions), and force use of named args.
-* tf.nn.rnn_cell.* and most functions in tf.nn.rnn.* (with the exception of dynamic_rnn and raw_rnn) are temporarily in tf.contrib.rnn. They will be moved back into core for TF 1.1.
+* tf.nn.rnn_cell.* and most functions in tf.nn.rnn.* (with the exception of dynamic_rnn and raw_rnn) are temporarily in tf.contrib.rnn. They will be moved back into core for TF 1.2.
* `tf.nn.sampled_softmax_loss` and `tf.nn.nce_loss` have both changed their API such that you need to switch the `inputs, labels` to `labels, inputs` parameters.
* The shape keyword argument of the `SparseTensor` constructor changes its name to `dense_shape` between Tensorflow 0.12 and Tensorflow 1.0.
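The 1.1.0 notes above also bring `tf.estimator.*` into the API. A hypothetical minimal `model_fn` under the new interface; the feature key `"x"` and the learning rate are made up for illustration:

```
import tensorflow as tf

def model_fn(features, labels, mode):
    # A single linear unit; "x" is an assumed feature key.
    w = tf.get_variable("w", [1])
    b = tf.get_variable("b", [1])
    predictions = w * features["x"] + b
    if mode == tf.estimator.ModeKeys.PREDICT:
        return tf.estimator.EstimatorSpec(mode, predictions=predictions)
    loss = tf.reduce_mean(tf.square(predictions - labels))
    train_op = tf.train.GradientDescentOptimizer(0.1).minimize(
        loss, global_step=tf.train.get_global_step())
    return tf.estimator.EstimatorSpec(mode, loss=loss, train_op=train_op)

estimator = tf.estimator.Estimator(model_fn=model_fn)
```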
diff --git a/WORKSPACE b/WORKSPACE
index 6ec1a7df3e..cab8389a55 100644
--- a/WORKSPACE
+++ b/WORKSPACE
@@ -20,7 +20,9 @@ load("//tensorflow:workspace.bzl", "tf_workspace")
#android_sdk_repository(
# name = "androidsdk",
# api_level = 23,
-# build_tools_version = "25.0.1",
+# # Ensure that you have the build_tools_version below installed in the
+# # SDK manager as it updates periodically.
+# build_tools_version = "25.0.2",
# # Replace with path to Android SDK on your system
# path = "<PATH_TO_SDK>",
#)
@@ -29,7 +31,9 @@ load("//tensorflow:workspace.bzl", "tf_workspace")
#android_ndk_repository(
# name="androidndk",
# path="<PATH_TO_NDK>",
-# api_level=14) # This needs to be 14 or higher to compile TensorFlow.
+# # This needs to be 14 or higher to compile TensorFlow.
+# # Note that the NDK version is not the API level.
+# api_level=14)
# Please add all new TensorFlow dependencies in workspace.bzl.
tf_workspace()
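For reference, a filled-in sketch of the two commented rules above; the `<PATH_TO_SDK>` and `<PATH_TO_NDK>` placeholders are left as-is and must point at local installs:

```
android_sdk_repository(
    name = "androidsdk",
    api_level = 23,
    # Must be installed in the SDK manager, per the comment above.
    build_tools_version = "25.0.2",
    path = "<PATH_TO_SDK>",
)

android_ndk_repository(
    name = "androidndk",
    path = "<PATH_TO_NDK>",
    api_level = 14,  # 14 or higher; this is the API level, not the NDK version.
)
```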
diff --git a/configure b/configure
index e59ee2a925..6360641be2 100755
--- a/configure
+++ b/configure
@@ -8,9 +8,6 @@ pushd `dirname $0` > /dev/null
SOURCE_BASE_DIR=`pwd -P`
popd > /dev/null
-# This file contains customized config settings.
-touch .bazelrc
-
PLATFORM="$(uname -s | tr 'A-Z' 'a-z')"
function is_linux() {
@@ -38,14 +35,6 @@ function is_windows() {
fi
}
-function bazel_fetch() {
- if [ -z "$TF_BAZEL_TARGETS" ]; then
- bazel fetch "//tensorflow/... -//tensorflow/contrib/nccl/... -//tensorflow/examples/android/..."
- else
- bazel fetch $TF_BAZEL_TARGETS
- fi
-}
-
function sed_hyphen_i() {
if is_macos; then
sed -i '' "$@"
@@ -54,6 +43,21 @@ function sed_hyphen_i() {
fi
}
+function write_to_bazelrc() {
+ echo "$1" >> .tf_configure.bazelrc
+}
+
+function write_action_env_to_bazelrc() {
+ write_to_bazelrc "build --action_env $1=\"$2\""
+}
+
+# This file contains customized config settings.
+rm -f .tf_configure.bazelrc
+touch .tf_configure.bazelrc
+touch .bazelrc
+sed_hyphen_i "/tf_configure/d" .bazelrc
+echo "import .tf_configure.bazelrc" >> .bazelrc
+
# Delete any leftover BUILD files from the Makefile build, which would interfere
# with Bazel parsing.
MAKEFILE_DOWNLOAD_DIR=tensorflow/contrib/makefile/downloads
@@ -164,6 +168,7 @@ if is_windows; then
TF_NEED_HDFS=0
TF_NEED_JEMALLOC=0
TF_NEED_OPENCL=0
+ TF_CUDA_CLANG=0
fi
if is_linux; then
@@ -181,9 +186,8 @@ else
TF_NEED_JEMALLOC=0
fi
-sed_hyphen_i -e "/with_jemalloc/d" .bazelrc
if [[ "$TF_NEED_JEMALLOC" == "1" ]]; then
- echo 'build --define with_jemalloc=true' >>.bazelrc
+ write_to_bazelrc 'build --define with_jemalloc=true'
fi
while [[ "$TF_NEED_GCP" == "" ]]; do
@@ -200,9 +204,8 @@ while [[ "$TF_NEED_GCP" == "" ]]; do
esac
done
-sed_hyphen_i -e "/with_gcp_support/d" .bazelrc
if [[ "$TF_NEED_GCP" == "1" ]]; then
- echo 'build --define with_gcp_support=true' >>.bazelrc
+ write_to_bazelrc 'build --define with_gcp_support=true'
fi
while [[ "$TF_NEED_HDFS" == "" ]]; do
@@ -219,9 +222,8 @@ while [[ "$TF_NEED_HDFS" == "" ]]; do
esac
done
-sed_hyphen_i -e "/with_hdfs_support/d" .bazelrc
if [[ "$TF_NEED_HDFS" == "1" ]]; then
- echo 'build --define with_hdfs_support=true' >>.bazelrc
+ write_to_bazelrc 'build --define with_hdfs_support=true'
fi
## Enable XLA.
@@ -235,9 +237,8 @@ while [[ "$TF_ENABLE_XLA" == "" ]]; do
esac
done
-sed_hyphen_i -e "/with_xla_support/d" .bazelrc
if [[ "$TF_ENABLE_XLA" == "1" ]]; then
- echo 'build --define with_xla_support=true' >>.bazelrc
+ write_to_bazelrc 'build --define with_xla_support=true'
fi
@@ -279,23 +280,11 @@ while [ "$TF_NEED_CUDA" == "" ]; do
esac
done
-sed_hyphen_i -e "/--action_env TF_NEED_CUDA/d" .bazelrc
-sed_hyphen_i -e "/--action_env CUD/d" .bazelrc
-sed_hyphen_i -e "/--action_env GCC_HOST/d" .bazelrc
-sed_hyphen_i -e "/--action_env TF_CUD/d" .bazelrc
-sed_hyphen_i -e "/--action_env CLANG_CUDA/d" .bazelrc
-
export TF_NEED_CUDA
-echo "build --action_env TF_NEED_CUDA=$TF_NEED_CUDA" >>.bazelrc
+write_action_env_to_bazelrc "TF_NEED_CUDA" "$TF_NEED_CUDA"
export TF_NEED_OPENCL
-if [[ "$TF_NEED_CUDA" == "0" ]] && [[ "$TF_NEED_OPENCL" == "0" ]]; then
- echo "Configuration finished"
- bazel_fetch
- exit
-fi
-
if [ "$TF_NEED_CUDA" == "1" ]; then
while [[ "$TF_CUDA_CLANG" == "" ]]; do
read -p "Do you want to use clang as CUDA compiler? [y/N] " INPUT
@@ -308,7 +297,7 @@ while [[ "$TF_CUDA_CLANG" == "" ]]; do
done
export TF_CUDA_CLANG
-echo "build --action_env TF_CUDA_CLANG=$TF_CUDA_CLANG" >>.bazelrc
+write_action_env_to_bazelrc "TF_CUDA_CLANG" "$TF_CUDA_CLANG"
# Set up which gcc nvcc should use as the host compiler
# No need to set this on Windows
@@ -324,7 +313,7 @@ while [[ "$TF_CUDA_CLANG" != "1" ]] && ! is_windows && true; do
fi
if [ -e "$GCC_HOST_COMPILER_PATH" ]; then
export GCC_HOST_COMPILER_PATH
- echo "build --action_env GCC_HOST_COMPILER_PATH=\"$GCC_HOST_COMPILER_PATH\"" >>.bazelrc
+ write_action_env_to_bazelrc "GCC_HOST_COMPILER_PATH" "$GCC_HOST_COMPILER_PATH"
break
fi
echo "Invalid gcc path. ${GCC_HOST_COMPILER_PATH} cannot be found" 1>&2
@@ -348,7 +337,7 @@ while [[ "$TF_CUDA_CLANG" == "1" ]] && true; do
fi
if [ -e "$CLANG_CUDA_COMPILER_PATH" ]; then
export CLANG_CUDA_COMPILER_PATH
- echo "build --action_env CLANG_CUDA_COMPILER_PATH=\"$CLANG_CUDA_COMPILER_PATH\"" >>.bazelrc
+ write_action_env_to_bazelrc "CLANG_CUDA_COMPILER_PATH" "$CLANG_CUDA_COMPILER_PATH"
break
fi
echo "Invalid clang path. ${CLANG_CUDA_COMPILER_PATH} cannot be found" 1>&2
@@ -399,10 +388,9 @@ while true; do
if [ -e "${CUDA_TOOLKIT_PATH}/${CUDA_RT_LIB_PATH}" ]; then
export CUDA_TOOLKIT_PATH
- echo "build --action_env CUDA_TOOLKIT_PATH=\"$CUDA_TOOLKIT_PATH\"" >>.bazelrc
-
+ write_action_env_to_bazelrc "CUDA_TOOLKIT_PATH" "$CUDA_TOOLKIT_PATH"
export TF_CUDA_VERSION
- echo "build --action_env TF_CUDA_VERSION=$TF_CUDA_VERSION" >>.bazelrc
+ write_action_env_to_bazelrc "TF_CUDA_VERSION" "$TF_CUDA_VERSION"
break
fi
echo "Invalid path to CUDA $TF_CUDA_VERSION toolkit. ${CUDA_TOOLKIT_PATH}/${CUDA_RT_LIB_PATH} cannot be found"
@@ -417,9 +405,9 @@ done
# Find out where the cuDNN library is installed
while true; do
- # Configure the Cudnn version to use.
+ # Configure the cuDNN version to use.
if [ -z "$TF_CUDNN_VERSION" ]; then
- read -p "Please specify the Cudnn version you want to use. [Leave empty to use system default]: " TF_CUDNN_VERSION
+ read -p "Please specify the cuDNN version you want to use. [Leave empty to use system default]: " TF_CUDNN_VERSION
fi
fromuser=""
@@ -454,10 +442,9 @@ while true; do
if [ -e "$CUDNN_INSTALL_PATH/${CUDA_DNN_LIB_ALT_PATH}" -o -e "$CUDNN_INSTALL_PATH/${CUDA_DNN_LIB_PATH}" ]; then
export TF_CUDNN_VERSION
- echo "build --action_env TF_CUDNN_VERSION=$TF_CUDNN_VERSION" >>.bazelrc
-
+ write_action_env_to_bazelrc "TF_CUDNN_VERSION" "$TF_CUDNN_VERSION"
export CUDNN_INSTALL_PATH
- echo "build --action_env CUDNN_INSTALL_PATH=\"$CUDNN_INSTALL_PATH\"" >>.bazelrc
+ write_action_env_to_bazelrc "CUDNN_INSTALL_PATH" "$CUDNN_INSTALL_PATH"
break
fi
@@ -470,10 +457,9 @@ while true; do
CUDNN_PATH_FROM_LDCONFIG="$($LDCONFIG_BIN -p | sed -n 's/.*libcudnn.so .* => \(.*\)/\1/p')"
if [ -e "${CUDNN_PATH_FROM_LDCONFIG}${TF_CUDNN_EXT}" ]; then
export TF_CUDNN_VERSION
- echo "build --action_env TF_CUDNN_VERSION=$TF_CUDNN_VERSION" >>.bazelrc
-
+ write_action_env_to_bazelrc "TF_CUDNN_VERSION" "$TF_CUDNN_VERSION"
export CUDNN_INSTALL_PATH="$(dirname ${CUDNN_PATH_FROM_LDCONFIG})"
- echo "build --action_env CUDNN_INSTALL_PATH=\"$CUDNN_INSTALL_PATH\"" >>.bazelrc
+ write_action_env_to_bazelrc "CUDNN_INSTALL_PATH" "$CUDNN_INSTALL_PATH"
break
fi
fi
@@ -525,7 +511,7 @@ EOF
fi
else
export TF_CUDA_COMPUTE_CAPABILITIES
- echo "build --action_env TF_CUDA_COMPUTE_CAPABILITIES=$TF_CUDA_COMPUTE_CAPABILITIES" >>.bazelrc
+ write_action_env_to_bazelrc "TF_CUDA_COMPUTE_CAPABILITIES" "$TF_CUDA_COMPUTE_CAPABILITIES"
break
fi
TF_CUDA_COMPUTE_CAPABILITIES=""
@@ -536,9 +522,9 @@ if is_windows; then
export CUDA_PATH="$CUDA_TOOLKIT_PATH"
export CUDA_COMPUTE_CAPABILITIES="$TF_CUDA_COMPUTE_CAPABILITIES"
export NO_WHOLE_ARCHIVE_OPTION=1
-
- # Set GCC_HOST_COMPILER_PATH to keep cuda_configure.bzl happy
- export GCC_HOST_COMPILER_PATH="/usr/bin/dummy_compiler"
+ write_action_env_to_bazelrc "CUDA_PATH" "$CUDA_PATH"
+ write_action_env_to_bazelrc "CUDA_COMPUTE_CAPABILITIES" "$CUDA_COMPUTE_CAPABILITIES"
+ write_action_env_to_bazelrc "NO_WHOLE_ARCHIVE_OPTION" "1"
fi
# end of if "$TF_NEED_CUDA" == "1"
@@ -629,6 +615,6 @@ done
# end of if "$TF_NEED_OPENCL" == "1"
fi
-bazel_fetch
-
+# TODO(gunan): Remove once bazel correctly handles changes in remote repositories.
+bazel clean
echo "Configuration finished"
diff --git a/tensorflow/compiler/aot/tests/make_test_graphs.py b/tensorflow/compiler/aot/tests/make_test_graphs.py
index 6981cb6757..98c13958d3 100644
--- a/tensorflow/compiler/aot/tests/make_test_graphs.py
+++ b/tensorflow/compiler/aot/tests/make_test_graphs.py
@@ -72,7 +72,7 @@ def tfadd_with_ckpt_saver(out_dir):
saver.save(sess, ckpt_file)
# Without the SaverDef, the restore op won't be named correctly.
saver_file = '%s/test_graph_tfadd_with_ckpt_saver.saver' % out_dir
- with open(saver_file, 'w') as f:
+ with open(saver_file, 'wb') as f:
f.write(saver.as_saver_def().SerializeToString())
@@ -113,7 +113,7 @@ def write_graph(build_graph, out_dir):
with g.as_default():
build_graph(out_dir)
filename = '%s/test_graph_%s.pb' % (out_dir, build_graph.__name__)
- with open(filename, 'w') as f:
+ with open(filename, 'wb') as f:
f.write(g.as_graph_def().SerializeToString())
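The `'w'` to `'wb'` change matters under Python 3, where `SerializeToString()` returns `bytes` and writing bytes to a text-mode file raises `TypeError`. A self-contained sketch of the fixed pattern (the output path is arbitrary):

```
import tensorflow as tf

graph_def = tf.Graph().as_graph_def()
# Serialized protos are bytes, so open the file in binary mode; this works
# under both Python 2 and Python 3.
with open('/tmp/test_graph.pb', 'wb') as f:
    f.write(graph_def.SerializeToString())
```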
diff --git a/tensorflow/compiler/tests/nary_ops_test.py b/tensorflow/compiler/tests/nary_ops_test.py
index a1f1e67a9f..2660e1d572 100644
--- a/tensorflow/compiler/tests/nary_ops_test.py
+++ b/tensorflow/compiler/tests/nary_ops_test.py
@@ -116,12 +116,14 @@ class NAryOpsTest(XLATestCase):
np.array([1, 1], dtype=np.int32)],
expected=np.array([[], []], dtype=np.float32))
- self._testNAry(lambda x: array_ops.strided_slice(*x),
- [np.array([[], [], []], dtype=np.float32),
- np.array([1, 0], dtype=np.int64),
- np.array([3, 0], dtype=np.int64),
- np.array([1, 1], dtype=np.int64)],
- expected=np.array([[], []], dtype=np.float32))
+ if np.int64 in self.int_types:
+ self._testNAry(
+ lambda x: array_ops.strided_slice(*x), [
+ np.array([[], [], []], dtype=np.float32), np.array(
+ [1, 0], dtype=np.int64), np.array([3, 0], dtype=np.int64),
+ np.array([1, 1], dtype=np.int64)
+ ],
+ expected=np.array([[], []], dtype=np.float32))
self._testNAry(lambda x: array_ops.strided_slice(*x),
[np.array([[1, 2, 3], [4, 5, 6], [7, 8, 9]],
diff --git a/tensorflow/compiler/tests/pooling_ops_3d_test.py b/tensorflow/compiler/tests/pooling_ops_3d_test.py
index 4eed903963..eb48fe555a 100644
--- a/tensorflow/compiler/tests/pooling_ops_3d_test.py
+++ b/tensorflow/compiler/tests/pooling_ops_3d_test.py
@@ -33,7 +33,7 @@ from tensorflow.python.platform import test
# MaxPoolGrad.
def _AvgPoolGrad(inputs, outputs, output_gradients, ksize, strides, padding):
del outputs # Unused by average-pooling gradients.
- return gen_nn_ops.avg_pool3d_grad(
+ return gen_nn_ops._avg_pool3d_grad(
inputs.get_shape().as_list(),
output_gradients,
ksize=ksize,
@@ -263,7 +263,7 @@ class Pooling3DTest(XLATestCase):
def testMaxPoolGradValidPadding1_1_3d(self):
self._VerifyGradient(
nn_ops.max_pool3d,
- gen_nn_ops.max_pool3d_grad,
+ gen_nn_ops._max_pool3d_grad,
input_sizes=[1, 3, 3, 3, 1],
ksize=[1, 1, 1],
strides=[1, 1, 1],
@@ -272,7 +272,7 @@ class Pooling3DTest(XLATestCase):
def testMaxPoolGradValidPadding2_1_6_3d(self):
self._VerifyGradient(
nn_ops.max_pool3d,
- gen_nn_ops.max_pool3d_grad,
+ gen_nn_ops._max_pool3d_grad,
input_sizes=[2, 3, 3, 6, 3],
ksize=[2, 2, 2],
strides=[1, 1, 1],
@@ -281,7 +281,7 @@ class Pooling3DTest(XLATestCase):
def testMaxPoolGradValidPadding2_1_7_3d(self):
self._VerifyGradient(
nn_ops.max_pool3d,
- gen_nn_ops.max_pool3d_grad,
+ gen_nn_ops._max_pool3d_grad,
input_sizes=[2, 3, 5, 7, 3],
ksize=[2, 2, 2],
strides=[1, 1, 1],
@@ -290,7 +290,7 @@ class Pooling3DTest(XLATestCase):
def testMaxPoolGradValidPadding2_2_3d(self):
self._VerifyGradient(
nn_ops.max_pool3d,
- gen_nn_ops.max_pool3d_grad,
+ gen_nn_ops._max_pool3d_grad,
input_sizes=[2, 2, 2, 2, 3],
ksize=[2, 2, 2],
strides=[2, 2, 2],
@@ -299,7 +299,7 @@ class Pooling3DTest(XLATestCase):
def testMaxPoolGradSamePadding1_1_3d(self):
self._VerifyGradient(
nn_ops.max_pool3d,
- gen_nn_ops.max_pool3d_grad,
+ gen_nn_ops._max_pool3d_grad,
input_sizes=[2, 3, 2, 4, 1],
ksize=[1, 1, 1],
strides=[1, 1, 1],
@@ -308,7 +308,7 @@ class Pooling3DTest(XLATestCase):
def testMaxPoolGradSamePadding2_1_3d(self):
self._VerifyGradient(
nn_ops.max_pool3d,
- gen_nn_ops.max_pool3d_grad,
+ gen_nn_ops._max_pool3d_grad,
input_sizes=[2, 3, 2, 4, 1],
ksize=[2, 2, 2],
strides=[1, 1, 1],
@@ -317,7 +317,7 @@ class Pooling3DTest(XLATestCase):
def testMaxPoolGradSamePadding2_2_3d(self):
self._VerifyGradient(
nn_ops.max_pool3d,
- gen_nn_ops.max_pool3d_grad,
+ gen_nn_ops._max_pool3d_grad,
input_sizes=[2, 5, 2, 4, 3],
ksize=[2, 2, 2],
strides=[2, 2, 2],
@@ -326,7 +326,7 @@ class Pooling3DTest(XLATestCase):
def testMaxPoolGradSamePadding3_1_3d(self):
self._VerifyGradient(
nn_ops.max_pool3d,
- gen_nn_ops.max_pool3d_grad,
+ gen_nn_ops._max_pool3d_grad,
input_sizes=[1, 3, 3, 7, 1],
ksize=[3, 3, 3],
strides=[1, 1, 1],
diff --git a/tensorflow/contrib/android/cmake/build.gradle b/tensorflow/contrib/android/cmake/build.gradle
index fb87de6212..17a57b99fd 100644
--- a/tensorflow/contrib/android/cmake/build.gradle
+++ b/tensorflow/contrib/android/cmake/build.gradle
@@ -5,7 +5,8 @@ def TF_SRC_DIR = projectDir.toString() + "/../../../.."
android {
compileSdkVersion 24
- buildToolsVersion '25.0.1'
+ // Check local build_tools_version as this is liable to change within Android Studio.
+ buildToolsVersion '25.0.2'
// for debugging native code purpose
publishNonDefault true
diff --git a/tensorflow/contrib/cmake/CMakeLists.txt b/tensorflow/contrib/cmake/CMakeLists.txt
index 3c8dc869af..e27df6898e 100644
--- a/tensorflow/contrib/cmake/CMakeLists.txt
+++ b/tensorflow/contrib/cmake/CMakeLists.txt
@@ -22,6 +22,7 @@ option(tensorflow_ENABLE_GPU "Enable GPU support" OFF)
option(tensorflow_ENABLE_SSL_SUPPORT "Enable boringssl support" OFF)
option(tensorflow_ENABLE_GRPC_SUPPORT "Enable gRPC support" ON)
option(tensorflow_ENABLE_HDFS_SUPPORT "Enable HDFS support" OFF)
+option(tensorflow_ENABLE_JEMALLOC_SUPPORT "Enable jemalloc support" OFF)
option(tensorflow_BUILD_CC_EXAMPLE "Build the C++ tutorial example" ON)
option(tensorflow_BUILD_PYTHON_BINDINGS "Build the Python bindings" ON)
option(tensorflow_BUILD_ALL_KERNELS "Build all OpKernels" ON)
@@ -29,6 +30,7 @@ option(tensorflow_BUILD_CONTRIB_KERNELS "Build OpKernels from tensorflow/contrib
option(tensorflow_BUILD_CC_TESTS "Build cc unit tests " OFF)
option(tensorflow_BUILD_PYTHON_TESTS "Build python unit tests " OFF)
option(tensorflow_OPTIMIZE_FOR_NATIVE_ARCH "Enable compiler optimizations for the native processor architecture (if available)" ON)
+option(tensorflow_WIN_CPU_SIMD_OPTIONS "Enables CPU SIMD instructions")
if (NOT WIN32)
# Threads: defines CMAKE_THREAD_LIBS_INIT and adds -pthread compile option
@@ -81,6 +83,22 @@ if (tensorflow_OPTIMIZE_FOR_NATIVE_ARCH)
endif()
endif()
+# MSVC SIMD instructions
+if (tensorflow_WIN_CPU_SIMD_OPTIONS)
+ if (WIN32)
+ CHECK_CXX_COMPILER_FLAG("${tensorflow_WIN_CPU_SIMD_OPTIONS}" COMPILER_OPT_WIN_CPU_SIMD_SUPPORTED)
+ if(COMPILER_OPT_WIN_CPU_SIMD_SUPPORTED)
+ set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${tensorflow_WIN_CPU_SIMD_OPTIONS}")
+ else()
+ message(FATAL_ERROR "${tensorflow_WIN_CPU_SIMD_OPTIONS} not supported")
+ endif()
+ endif()
+endif()
+
+if (tensorflow_ENABLE_JEMALLOC_SUPPORT)
+ add_definitions(-DTENSORFLOW_USE_JEMALLOC -DJEMALLOC_EXPORT=)
+endif()
+
# External dependencies
include(zlib)
include(gif)
@@ -148,6 +166,12 @@ if(tensorflow_ENABLE_GRPC_SUPPORT)
list(APPEND tensorflow_EXTERNAL_DEPENDENCIES grpc)
include_directories(${GRPC_INCLUDE_DIRS})
endif()
+if(tensorflow_ENABLE_JEMALLOC_SUPPORT)
+ include(jemalloc)
+ list(APPEND tensorflow_EXTERNAL_LIBRARIES ${jemalloc_STATIC_LIBRARIES})
+ list(APPEND tensorflow_EXTERNAL_DEPENDENCIES jemalloc)
+ include_directories(${jemalloc_INCLUDE_DIRS})
+endif()
if(WIN32)
list(APPEND tensorflow_EXTERNAL_LIBRARIES wsock32 ws2_32 shlwapi)
endif()
@@ -202,7 +226,6 @@ endif()
# Let's get to work!
include(tf_core_framework.cmake)
-include(tf_tools.cmake)
# NOTE: Disabled until issue #3996 is fixed.
# include(tf_stream_executor.cmake)
if (tensorflow_ENABLE_GPU)
@@ -223,6 +246,7 @@ if(tensorflow_BUILD_CC_EXAMPLE)
include(tf_tutorials.cmake)
include(tf_label_image_example.cmake)
endif()
+include(tf_tools.cmake)
if(tensorflow_BUILD_PYTHON_BINDINGS)
include(tensorboard)
include(tf_python.cmake)
diff --git a/tensorflow/contrib/cmake/README.md b/tensorflow/contrib/cmake/README.md
index 2641d5292d..af949f79fa 100644
--- a/tensorflow/contrib/cmake/README.md
+++ b/tensorflow/contrib/cmake/README.md
@@ -45,7 +45,7 @@ bindings.
### Pre-requisites
-* CMake version 3.5 up to 3.6
+* CMake version 3.5 or later.
* [Git](http://git-scm.com)
@@ -181,7 +181,11 @@ Step-by-step Windows build
More? -Dtensorflow_ENABLE_GPU=ON ^
More? -DCUDNN_HOME="D:\...\cudnn"
```
-
+ To enable SIMD instructions with MSVC, as AVX and SSE, define it as follows:
+ ```
+ More? -Dtensorflow_WIN_CPU_SIMD_OPTIONS=/arch:AVX
+ ```
+
Note that the `-DCMAKE_BUILD_TYPE=Release` flag must match the build
configuration that you choose when invoking `msbuild`. The known-good
values are `Release` and `RelWithDebInfo`. The `Debug` build type is
diff --git a/tensorflow/contrib/cmake/external/jemalloc.cmake b/tensorflow/contrib/cmake/external/jemalloc.cmake
new file mode 100644
index 0000000000..b0b212eeb6
--- /dev/null
+++ b/tensorflow/contrib/cmake/external/jemalloc.cmake
@@ -0,0 +1,33 @@
+include (ExternalProject)
+
+set(jemalloc_INCLUDE_DIRS ${CMAKE_CURRENT_BINARY_DIR}/jemalloc/src/jemalloc/include)
+set(jemalloc_URL https://github.com/jemalloc/jemalloc-cmake/archive/jemalloc-cmake.4.3.1.tar.gz)
+set(jemalloc_HASH SHA256=f9be9a05fe906deb5c1c8ca818071a7d2e27d66fd87f5ba9a7bf3750bcedeaf0)
+set(jemalloc_BUILD ${CMAKE_CURRENT_BINARY_DIR}/jemalloc/src/jemalloc)
+
+if (WIN32)
+ set(jemalloc_INCLUDE_DIRS
+ ${jemalloc_INCLUDE_DIRS}
+ ${CMAKE_CURRENT_BINARY_DIR}/jemalloc/src/jemalloc/include/msvc_compat
+ )
+ set(jemalloc_ADDITIONAL_CMAKE_OPTIONS -A x64)
+ set(jemalloc_STATIC_LIBRARIES ${jemalloc_BUILD}/Release/jemalloc.lib)
+else()
+ set(jemalloc_STATIC_LIBRARIES ${jemalloc_BUILD}/Release/jemalloc.a)
+endif()
+
+ExternalProject_Add(jemalloc
+ PREFIX jemalloc
+ URL ${jemalloc_URL}
+ URL_HASH ${jemalloc_HASH}
+ DOWNLOAD_DIR "${DOWNLOAD_LOCATION}"
+ BUILD_IN_SOURCE 1
+ CONFIGURE_COMMAND ${CMAKE_COMMAND}
+ -DCMAKE_BUILD_TYPE:STRING=Release
+ -DCMAKE_VERBOSE_MAKEFILE:BOOL=OFF
+ -Dwith-jemalloc-prefix:STRING=jemalloc_
+ -Dwithout-export:BOOL=ON
+ ${jemalloc_ADDITIONAL_CMAKE_OPTIONS}
+ BUILD_COMMAND ${CMAKE_COMMAND} --build . --config Release --target jemalloc
+ INSTALL_COMMAND ${CMAKE_COMMAND} -E echo "Skipping install step."
+)
diff --git a/tensorflow/contrib/cmake/tf_tools.cmake b/tensorflow/contrib/cmake/tf_tools.cmake
index 5151fdb444..636caf5f3d 100644
--- a/tensorflow/contrib/cmake/tf_tools.cmake
+++ b/tensorflow/contrib/cmake/tf_tools.cmake
@@ -63,7 +63,6 @@ add_executable(${transform_graph}
target_link_libraries(${transform_graph} PUBLIC
tf_protos_cc
- ${tf_core_gpu_kernels_lib}
${tensorflow_EXTERNAL_LIBRARIES}
)
@@ -83,7 +82,6 @@ add_executable(${summarize_graph}
target_link_libraries(${summarize_graph} PUBLIC
tf_protos_cc
- ${tf_core_gpu_kernels_lib}
${tensorflow_EXTERNAL_LIBRARIES}
)
@@ -103,7 +101,6 @@ add_executable(${compare_graphs}
target_link_libraries(${compare_graphs} PUBLIC
tf_protos_cc
- ${tf_core_gpu_kernels_lib}
${tensorflow_EXTERNAL_LIBRARIES}
)
@@ -118,6 +115,8 @@ add_executable(${benchmark_model}
$<TARGET_OBJECTS:tf_core_ops>
$<TARGET_OBJECTS:tf_core_direct_session>
$<TARGET_OBJECTS:tf_core_kernels>
+ $<$<BOOL:${tensorflow_ENABLE_GPU}>:$<TARGET_OBJECTS:tf_core_kernels_cpu_only>>
+ $<$<BOOL:${tensorflow_ENABLE_GPU}>:$<TARGET_OBJECTS:tf_stream_executor>>
)
target_link_libraries(${benchmark_model} PUBLIC
diff --git a/tensorflow/contrib/ios_examples/camera/CameraExampleViewController.h b/tensorflow/contrib/ios_examples/camera/CameraExampleViewController.h
index 0aefbc6eed..df744428a8 100644
--- a/tensorflow/contrib/ios_examples/camera/CameraExampleViewController.h
+++ b/tensorflow/contrib/ios_examples/camera/CameraExampleViewController.h
@@ -29,7 +29,6 @@
dispatch_queue_t videoDataOutputQueue;
AVCaptureStillImageOutput *stillImageOutput;
UIView *flashView;
- UIImage *square;
BOOL isUsingFrontFacingCamera;
AVSpeechSynthesizer *synth;
NSMutableDictionary *oldPredictionValues;
diff --git a/tensorflow/contrib/ios_examples/camera/CameraExampleViewController.mm b/tensorflow/contrib/ios_examples/camera/CameraExampleViewController.mm
index e975a25b5e..20c49d5b6a 100644
--- a/tensorflow/contrib/ios_examples/camera/CameraExampleViewController.mm
+++ b/tensorflow/contrib/ios_examples/camera/CameraExampleViewController.mm
@@ -369,13 +369,8 @@ didOutputSampleBuffer:(CMSampleBufferRef)sampleBuffer
isUsingFrontFacingCamera = !isUsingFrontFacingCamera;
}
-- (void)didReceiveMemoryWarning {
- [super didReceiveMemoryWarning];
-}
-
- (void)viewDidLoad {
[super viewDidLoad];
- square = [UIImage imageNamed:@"squarePNG"];
synth = [[AVSpeechSynthesizer alloc] init];
labelLayers = [[NSMutableArray alloc] init];
oldPredictionValues = [[NSMutableDictionary alloc] init];
@@ -399,26 +394,6 @@ didOutputSampleBuffer:(CMSampleBufferRef)sampleBuffer
[self setupAVCapture];
}
-- (void)viewDidUnload {
- [super viewDidUnload];
-}
-
-- (void)viewWillAppear:(BOOL)animated {
- [super viewWillAppear:animated];
-}
-
-- (void)viewDidAppear:(BOOL)animated {
- [super viewDidAppear:animated];
-}
-
-- (void)viewWillDisappear:(BOOL)animated {
- [super viewWillDisappear:animated];
-}
-
-- (void)viewDidDisappear:(BOOL)animated {
- [super viewDidDisappear:animated];
-}
-
- (BOOL)shouldAutorotateToInterfaceOrientation:
(UIInterfaceOrientation)interfaceOrientation {
return (interfaceOrientation == UIInterfaceOrientationPortrait);
diff --git a/tensorflow/contrib/ios_examples/camera/camera_example.xcodeproj/project.pbxproj b/tensorflow/contrib/ios_examples/camera/camera_example.xcodeproj/project.pbxproj
index 1134d0e117..e9d783e49d 100644
--- a/tensorflow/contrib/ios_examples/camera/camera_example.xcodeproj/project.pbxproj
+++ b/tensorflow/contrib/ios_examples/camera/camera_example.xcodeproj/project.pbxproj
@@ -13,7 +13,6 @@
591D3ECF1CFF7FCE0059011C /* ImageIO.framework in Frameworks */ = {isa = PBXBuildFile; fileRef = 591D3ECE1CFF7FCE0059011C /* ImageIO.framework */; };
591D3ED21CFF85C30059011C /* ios_image_load.mm in Sources */ = {isa = PBXBuildFile; fileRef = 591D3ED11CFF85C30059011C /* ios_image_load.mm */; };
591D3ED51CFF85FD0059011C /* tensorflow_utils.mm in Sources */ = {isa = PBXBuildFile; fileRef = 591D3ED31CFF85FD0059011C /* tensorflow_utils.mm */; };
- 591D3EDA1CFFA83A0059011C /* grace_hopper.jpg in Resources */ = {isa = PBXBuildFile; fileRef = 591D3ED71CFFA83A0059011C /* grace_hopper.jpg */; };
591D3EDB1CFFA83A0059011C /* imagenet_comp_graph_label_strings.txt in Resources */ = {isa = PBXBuildFile; fileRef = 591D3ED81CFFA83A0059011C /* imagenet_comp_graph_label_strings.txt */; };
591D3EDC1CFFA83A0059011C /* tensorflow_inception_graph.pb in Resources */ = {isa = PBXBuildFile; fileRef = 591D3ED91CFFA83A0059011C /* tensorflow_inception_graph.pb */; };
591D3EDF1CFFAD230059011C /* libprotobuf-lite.a in Frameworks */ = {isa = PBXBuildFile; fileRef = 591D3EDD1CFFAD230059011C /* libprotobuf-lite.a */; };
@@ -38,7 +37,6 @@
591D3ED11CFF85C30059011C /* ios_image_load.mm */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.objcpp; path = ios_image_load.mm; sourceTree = SOURCE_ROOT; };
591D3ED31CFF85FD0059011C /* tensorflow_utils.mm */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.objcpp; path = tensorflow_utils.mm; sourceTree = SOURCE_ROOT; };
591D3ED41CFF85FD0059011C /* tensorflow_utils.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = tensorflow_utils.h; sourceTree = SOURCE_ROOT; };
- 591D3ED71CFFA83A0059011C /* grace_hopper.jpg */ = {isa = PBXFileReference; lastKnownFileType = image.jpeg; path = grace_hopper.jpg; sourceTree = "<group>"; };
591D3ED81CFFA83A0059011C /* imagenet_comp_graph_label_strings.txt */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = text; path = imagenet_comp_graph_label_strings.txt; sourceTree = "<group>"; };
591D3ED91CFFA83A0059011C /* tensorflow_inception_graph.pb */ = {isa = PBXFileReference; lastKnownFileType = file; path = tensorflow_inception_graph.pb; sourceTree = "<group>"; };
591D3EDD1CFFAD230059011C /* libprotobuf-lite.a */ = {isa = PBXFileReference; lastKnownFileType = archive.ar; name = "libprotobuf-lite.a"; path = "../../makefile/gen/protobuf_ios/lib/libprotobuf-lite.a"; sourceTree = "<group>"; };
@@ -79,7 +77,6 @@
591D3ED61CFFA83A0059011C /* data */ = {
isa = PBXGroup;
children = (
- 591D3ED71CFFA83A0059011C /* grace_hopper.jpg */,
591D3ED81CFFA83A0059011C /* imagenet_comp_graph_label_strings.txt */,
591D3ED91CFFA83A0059011C /* tensorflow_inception_graph.pb */,
);
@@ -199,7 +196,6 @@
isa = PBXResourcesBuildPhase;
buildActionMask = 2147483647;
files = (
- 591D3EDA1CFFA83A0059011C /* grace_hopper.jpg in Resources */,
591D3EDC1CFFA83A0059011C /* tensorflow_inception_graph.pb in Resources */,
592FF90D18EDD0DA00C164F8 /* MainStoryboard_iPhone.storyboard in Resources */,
591D3EDB1CFFA83A0059011C /* imagenet_comp_graph_label_strings.txt in Resources */,
diff --git a/tensorflow/contrib/ios_examples/camera/data/grace_hopper.jpg b/tensorflow/contrib/ios_examples/camera/data/grace_hopper.jpg
deleted file mode 100644
index d2a427810f..0000000000
--- a/tensorflow/contrib/ios_examples/camera/data/grace_hopper.jpg
+++ /dev/null
Binary files differ
diff --git a/tensorflow/contrib/ios_examples/camera/squarePNG.png b/tensorflow/contrib/ios_examples/camera/squarePNG.png
deleted file mode 100644
index e26ff840ed..0000000000
--- a/tensorflow/contrib/ios_examples/camera/squarePNG.png
+++ /dev/null
Binary files differ
diff --git a/tensorflow/contrib/layers/python/layers/embedding_ops.py b/tensorflow/contrib/layers/python/layers/embedding_ops.py
index f0ed31d1d1..b1a7f7ee59 100644
--- a/tensorflow/contrib/layers/python/layers/embedding_ops.py
+++ b/tensorflow/contrib/layers/python/layers/embedding_ops.py
@@ -17,24 +17,31 @@ from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
+from six.moves import xrange # pylint: disable=redefined-builtin
+
from tensorflow.contrib.framework.python.framework import tensor_util as contrib_tensor_util
from tensorflow.contrib.layers.python.ops import sparse_feature_cross_op
+from tensorflow.python.framework import constant_op
from tensorflow.python.framework import dtypes
from tensorflow.python.framework import ops
from tensorflow.python.framework import sparse_tensor
from tensorflow.python.framework import tensor_shape
from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import clip_ops
from tensorflow.python.ops import control_flow_ops
+from tensorflow.python.ops import data_flow_ops
from tensorflow.python.ops import embedding_ops
from tensorflow.python.ops import math_ops
+from tensorflow.python.ops import resource_variable_ops
from tensorflow.python.ops import sparse_ops
from tensorflow.python.ops import variables
from tensorflow.python.platform import tf_logging as logging
__all__ = [
"safe_embedding_lookup_sparse", "scattered_embedding_lookup",
- "scattered_embedding_lookup_sparse", "embedding_lookup_unique"
+ "scattered_embedding_lookup_sparse", "embedding_lookup_unique",
+ "embedding_lookup_sparse_with_distributed_aggregation"
]
@@ -548,3 +555,351 @@ def _sampled_scattered_embedding_lookup_sparse(params,
return math_ops.unsorted_segment_sum(embeddings, segment_ids,
num_segments=num_segments,
name=name_scope)
+
+
+def embedding_lookup_sparse_with_distributed_aggregation(
+ params,
+ sp_ids,
+ sp_weights,
+ partition_strategy="mod",
+ name=None,
+ combiner=None,
+ max_norm=None):
+ """Computes embeddings for the given ids and weights.
+
+ Embeddings belonging to same param are aggregated on that device first. This
+ op is intended to decrease data transmission and improve parallelism. See
+ `tf.nn.embedding_lookup_sparse` for the functionality and example of this op.
+
+ Args:
+ params: A single tensor representing the complete embedding tensor,
+ or a list of P tensors all of same shape except for the first dimension,
+ representing sharded embedding tensors. Alternatively, a
+ `PartitionedVariable`, created by partitioning along dimension 0. Each
+ element must be appropriately sized for the given `partition_strategy`.
+ sp_ids: N x M SparseTensor of int64 ids (typically from FeatureValueToId),
+ where N is typically batch size and M is arbitrary.
+ sp_weights: either a SparseTensor of float / double weights, or None to
+ indicate all weights should be taken to be 1. If specified, sp_weights
+ must have exactly the same shape and indices as sp_ids.
+ partition_strategy: A string specifying the partitioning strategy, relevant
+ if `len(params) > 1`. Currently `"div"` and `"mod"` are supported. Default
+ is `"mod"`. See `tf.nn.embedding_lookup` for more details.
+ name: Optional name for the op.
+ combiner: A string specifying the reduction op. Currently "mean", "sqrtn"
+ and "sum" are supported.
+ "sum" computes the weighted sum of the embedding results for each row.
+ "mean" is the weighted sum divided by the total weight.
+ "sqrtn" is the weighted sum divided by the square root of the sum of the
+ squares of the weights.
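+      For example, a row with two ids whose embeddings are e1 and e2, with
+      weights 2.0 and 1.0, combines to (2.0 * e1 + 1.0 * e2) / 3.0 under
+      "mean".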
+ max_norm: If not None, each embedding is normalized to have l2 norm equal
+ to max_norm before combining.
+
+ Returns:
+ A dense tensor representing the combined embeddings for the
+ sparse ids. For each row in the dense tensor represented by sp_ids, the op
+ looks up the embeddings for all ids in that row, multiplies them by the
+ corresponding weight, and combines these embeddings as specified.
+
+ Raises:
+ TypeError: If sp_ids is not a SparseTensor, or if sp_weights is neither
+ None nor SparseTensor.
+ ValueError: If combiner is not one of {"mean", "sqrtn", "sum"}.
+ """
+ if combiner is None:
+ logging.warn("The default value of combiner will change from \"mean\" "
+ "to \"sqrtn\" after 2016/11/01.")
+ combiner = "mean"
+ if combiner not in ("mean", "sqrtn", "sum"):
+ raise ValueError("combiner must be one of 'mean', 'sqrtn' or 'sum'")
+ if isinstance(params, variables.PartitionedVariable):
+ params = list(params) # Iterate to get the underlying Variables.
+ if not isinstance(params, list):
+ params = [params]
+ if not isinstance(sp_ids, sparse_tensor.SparseTensor):
+ raise TypeError("sp_ids must be SparseTensor")
+ ignore_weights = sp_weights is None
+ if not ignore_weights:
+ if not isinstance(sp_weights, sparse_tensor.SparseTensor):
+ raise TypeError("sp_weights must be either None or SparseTensor")
+ sp_ids.values.get_shape().assert_is_compatible_with(
+ sp_weights.values.get_shape())
+ sp_ids.indices.get_shape().assert_is_compatible_with(
+ sp_weights.indices.get_shape())
+ sp_ids.dense_shape.get_shape().assert_is_compatible_with(
+ sp_weights.dense_shape.get_shape())
+ # TODO(yleon): Add enhanced node assertions to verify that sp_ids and
+ # sp_weights have equal indices and shapes.
+
+ with ops.name_scope(name, "embedding_lookup_sparse",
+ params + [sp_ids]) as name:
+ segment_ids = sp_ids.indices[:, 0]
+ if segment_ids.dtype != dtypes.int32:
+ segment_ids = math_ops.cast(segment_ids, dtypes.int32)
+
+ ids = sp_ids.values
+ if ignore_weights:
+ ids, idx = array_ops.unique(ids)
+ else:
+ idx = None
+
+ weights = None if ignore_weights else sp_weights.values
+ embeddings = _embedding_lookup_with_distributed_aggregation(
+ params,
+ ids,
+ partition_strategy=partition_strategy,
+ max_norm=max_norm,
+ weights=weights,
+ idx=idx,
+ segment_ids=segment_ids)
+    # Set weights to all ones if weights are ignored.
+ if ignore_weights:
+ weights = array_ops.fill([array_ops.shape(segment_ids)[0]], 1)
+ if weights.dtype != embeddings.dtype:
+ weights = math_ops.cast(weights, embeddings.dtype)
+ # Reshape weights.
+ ones = array_ops.fill(
+ array_ops.expand_dims(array_ops.rank(embeddings) - 1, 0), 1)
+ bcast_weights_shape = array_ops.concat([array_ops.shape(weights), ones], 0)
+ orig_weights_shape = weights.get_shape()
+ weights = array_ops.reshape(weights, bcast_weights_shape)
+ if embeddings.get_shape().ndims is not None:
+ weights.set_shape(
+ orig_weights_shape.concatenate(
+ [1 for _ in range(embeddings.get_shape().ndims - 1)]))
+
+ if combiner == "mean":
+ weight_sum = math_ops.segment_sum(weights, segment_ids)
+ embeddings = math_ops.div(embeddings, weight_sum)
+ elif combiner == "sqrtn":
+ weights_squared = math_ops.pow(weights, 2)
+ weight_sum = math_ops.segment_sum(weights_squared, segment_ids)
+ weight_sum_sqrt = math_ops.sqrt(weight_sum)
+ embeddings = math_ops.div(embeddings, weight_sum_sqrt)
+ elif combiner != "sum":
+ assert False, "Unrecognized combiner"
+ return embeddings
+
+
+def _do_gather(params, ids, validate_indices=True, name=None):
+ """Deals with doing gather differently for resource variables."""
+ if isinstance(params, resource_variable_ops.ResourceVariable):
+ return params.sparse_read(ids, name=name)
+ return array_ops.gather(
+ params, ids, name=name, validate_indices=validate_indices)
+
+
+def _embedding_lookup_with_distributed_aggregation(params,
+ ids,
+ partition_strategy="mod",
+ name=None,
+ validate_indices=True,
+ max_norm=None,
+ weights=None,
+ idx=None,
+ segment_ids=None):
+ """Lookup helper for embedding_lookup_sparse_with_distributed_aggregation."""
+ if params is None or params == []: # pylint: disable=g-explicit-bool-comparison
+ raise ValueError("Need at least one param")
+ if isinstance(params, variables.PartitionedVariable):
+ params = list(params) # Iterate to get the underlying Variables.
+ if not isinstance(params, list):
+ params = [params]
+
+ def maybe_normalize(x):
+ if max_norm is not None:
+ if x.get_shape().ndims is not None:
+ ndims = x.get_shape().ndims
+ else:
+ ndims = array_ops.size(array_ops.shape(x))
+ return clip_ops.clip_by_norm(x, max_norm, axes=list(range(1, ndims)))
+ return x
+
+ with ops.name_scope(name, "embedding_lookup_with_distributed_aggregation",
+ params + [ids]) as name:
+ np = len(params) # Number of partitions
+ # Preserve the resource variable status to avoid accidental dense reads.
+ if not any(
+ isinstance(p, resource_variable_ops.ResourceVariable) for p in params):
+ params = ops.convert_n_to_tensor_or_indexed_slices(params, name="params")
+ if np == 1:
+ with ops.colocate_with(params[0]):
+ ret = maybe_normalize(
+ _do_gather(params[0], ids, validate_indices=validate_indices))
+ ignore_weights = weights is None
+ if not ignore_weights:
+ if weights.dtype != ret.dtype:
+ weights = math_ops.cast(weights, ret.dtype)
+ # Reshape to allow broadcast
+ ones = array_ops.fill(
+ array_ops.expand_dims(array_ops.rank(ret) - 1, 0), 1)
+ bcast_weights_shape = array_ops.concat(
+ [array_ops.shape(weights), ones], 0)
+ orig_weights_shape = weights.get_shape()
+ weights = array_ops.reshape(weights, bcast_weights_shape)
+ # Set weights shape after reshape
+ if ret.get_shape().ndims is not None:
+ weights.set_shape(
+ orig_weights_shape.concatenate(
+ [1 for _ in range(ret.get_shape().ndims - 1)]))
+ ret *= weights
+ return math_ops.segment_sum(ret, segment_ids, name=name)
+ else:
+ return math_ops.sparse_segment_sum(ret, idx, segment_ids, name=name)
+ else:
+ ids = ops.convert_to_tensor(ids, name="ids")
+ flat_ids = array_ops.reshape(ids, [-1])
+ original_indices = math_ops.range(array_ops.size(flat_ids))
+
+ # Create p_assignments and set new_ids depending on the strategy.
+ if partition_strategy == "mod":
+ p_assignments = flat_ids % np
+ new_ids = flat_ids // np
+ elif partition_strategy == "div":
+ # Compute num_total_ids as the sum of dim-0 of params, then assign to
+ # partitions based on a constant number of ids per partition. Optimize
+ # if we already know the full shape statically.
+ dim_0_size = params[0].get_shape()[0]
+ for p in xrange(1, np):
+ dim_0_size += params[p].get_shape()[0]
+ if dim_0_size.value:
+ num_total_ids = constant_op.constant(dim_0_size.value, flat_ids.dtype)
+ else:
+ dim_0_sizes = []
+ for p in xrange(np):
+ if params[p].get_shape()[0].value is not None:
+ dim_0_sizes.append(params[p].get_shape()[0].value)
+ else:
+ with ops.colocate_with(params[p]):
+ dim_0_sizes.append(array_ops.shape(params[p])[0])
+ num_total_ids = math_ops.reduce_sum(
+ math_ops.cast(array_ops.stack(dim_0_sizes), flat_ids.dtype))
+ ids_per_partition = num_total_ids // np
+ extras = num_total_ids % np
+
+ p_assignments = math_ops.maximum(flat_ids // (ids_per_partition + 1), (
+ flat_ids - extras) // ids_per_partition)
+
+ # Emulate a conditional using a boolean indicator tensor
+ is_in_first_extras_partitions = math_ops.cast(p_assignments < extras,
+ flat_ids.dtype)
+ new_ids = (is_in_first_extras_partitions * (flat_ids %
+ (ids_per_partition + 1)) +
+ (1 - is_in_first_extras_partitions) * (
+ (flat_ids - extras) % ids_per_partition))
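+        # Worked example (illustrative): with num_total_ids = 10 and np = 3,
+        # ids_per_partition = 3 and extras = 1, so partition 0 owns four ids
+        # (0..3) and partitions 1 and 2 own three ids each; id 5 then maps
+        # to partition (5 - 1) // 3 = 1 with new id (5 - 1) % 3 = 1.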
+ else:
+ raise ValueError("Unrecognized partition strategy: " +
+ partition_strategy)
+
+ # Cast partition assignments to int32 for use in dynamic_partition.
+ # There really should not be more than 2^32 partitions.
+ p_assignments = math_ops.cast(p_assignments, dtypes.int32)
+ # Partition list of ids based on assignments into np separate lists
+ gather_ids = data_flow_ops.dynamic_partition(new_ids, p_assignments, np)
+ # Similarly, partition the original indices.
+ pindices = data_flow_ops.dynamic_partition(original_indices,
+ p_assignments, np)
+ # Do np separate lookups, finding embeddings for plist[p] in params[p]
+ partitioned_result = []
+ for p in xrange(np):
+ with ops.colocate_with(params[p]):
+ partitioned_result.append(
+ _do_gather(
+ params[p], gather_ids[p], validate_indices=validate_indices))
+
+ ignore_weights = weights is None
+ if not ignore_weights:
+ # Partition weights according to pindices.
+ partitioned_weight = []
+ for p in xrange(np):
+ partitioned_weight.append(array_ops.gather(weights, pindices[p]))
+ # Reshape each partition result.
+ element_shape = params[0].get_shape()[1:]
+ for p in params[1:]:
+ element_shape = element_shape.merge_with(p.get_shape()[1:])
+ if element_shape.is_fully_defined():
+ for p in xrange(np):
+ with ops.colocate_with(params[p]):
+ partitioned_result[p] = array_ops.reshape(
+ partitioned_result[p],
+ array_ops.concat([array_ops.shape(pindices[p]), element_shape],
+ 0))
+ else:
+ with ops.colocate_with(params[0]):
+ params_shape = array_ops.shape(params[0])
+ for p in xrange(np):
+ with ops.colocate_with(params[p]):
+ partitioned_result[p] = array_ops.reshape(
+ partitioned_result[p],
+ array_ops.concat([
+ array_ops.shape(pindices[p]), array_ops.slice(
+ params_shape, [1], [-1])
+ ], 0))
+ # Normalize each partition result.
+ for p in xrange(np):
+ with ops.colocate_with(params[p]):
+ partitioned_result[p] = maybe_normalize(partitioned_result[p])
+ if not ignore_weights:
+ # Multiply each partition result with partition weights.
+ for p in xrange(np):
+ with ops.colocate_with(params[p]):
+ if partitioned_weight[p].dtype != partitioned_result[p].dtype:
+ partitioned_weight[p] = math_ops.cast(partitioned_weight[p],
+ partitioned_result[p].dtype)
+ # Reshape partition weights.
+ ones = array_ops.fill(
+ array_ops.expand_dims(
+ array_ops.rank(partitioned_result[p]) - 1, 0), 1)
+ bcast_weights_shape = array_ops.concat(
+ [array_ops.shape(partitioned_weight[p]), ones], 0)
+ orig_weights_shape = partitioned_weight[p].get_shape()
+ partitioned_weight[p] = array_ops.reshape(partitioned_weight[p],
+ bcast_weights_shape)
+ if partitioned_result[p].get_shape().ndims is not None:
+ partitioned_weight[p].set_shape(
+ orig_weights_shape.concatenate([
+ 1
+ for _ in range(partitioned_result[p].get_shape().ndims -
+ 1)
+ ]))
+ partitioned_result[p] *= partitioned_weight[p]
+ partitioned_segment_ids = []
+ for p in xrange(np):
+ if not ignore_weights:
+ # Partition segment_ids according to pindices.
+ p_segment_ids = array_ops.gather(segment_ids, pindices[p])
+        # Renumber the p_segment_ids to meet segment_sum's requirements. Note
+        # that unique_p_segment_ids contains the unique segment ids of this
+        # partition and that their order is unchanged.
+ unique_p_segment_ids, unique_p_segment_idx = array_ops.unique(
+ p_segment_ids)
+ partitioned_segment_ids.append(unique_p_segment_ids)
+ # segment_sum this partition's result.
+ with ops.colocate_with(params[p]):
+ partitioned_result[p] = math_ops.segment_sum(
+ partitioned_result[p], unique_p_segment_idx)
+ else:
+        # When weights are ignored, we need the positions of this partition's
+        # elements within idx and segment_ids.
+ _, exclude_idx = array_ops.setdiff1d(idx, pindices[p])
+ all_idx = math_ops.range(array_ops.shape(idx)[0])
+ _, include_idx = array_ops.setdiff1d(all_idx, exclude_idx)
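+        # (The first setdiff1d drops this partition's entries from idx; the
+        # second, taken against the full index range, recovers exactly the
+        # positions in idx that belong to partition p.)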
+        # Gather segment_ids and idx according to those positions.
+ p_segment_ids = array_ops.gather(segment_ids, include_idx)
+ p_idx = array_ops.gather(idx, include_idx)
+        # Renumber the p_segment_ids, as in the weighted case above.
+ unique_p_segment_ids, unique_p_segment_idx = array_ops.unique(
+ p_segment_ids)
+ _, unique_p_idx_idx = array_ops.unique(p_idx)
+ partitioned_segment_ids.append(unique_p_segment_ids)
+ with ops.colocate_with(params[p]):
+ partitioned_result[p] = math_ops.sparse_segment_sum(
+ partitioned_result[p], unique_p_idx_idx, unique_p_segment_idx)
+ # Concat each partition's segment_ids and result for final segment_sum.
+ concat_segment_ids = array_ops.concat(partitioned_segment_ids, 0)
+ concat_partitioned_result = array_ops.concat(partitioned_result, 0)
+ return math_ops.unsorted_segment_sum(
+ concat_partitioned_result,
+ concat_segment_ids,
+ math_ops.reduce_max(concat_segment_ids) + 1,
+ name=name)
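
A minimal usage sketch of the new op for readers skimming the hunk above. The shard shapes, ids, and graph-mode session here are illustrative assumptions, not part of the change:

```python
import tensorflow as tf
from tensorflow.contrib.layers.python.layers import embedding_ops

# A 9 x 4 embedding table split into three shards along dimension 0.
params = [tf.get_variable("shard_%d" % i, shape=[3, 4]) for i in range(3)]

# Two batch rows: row 0 holds ids {0, 5}, row 1 holds id {7}.
sp_ids = tf.SparseTensor(
    indices=[[0, 0], [0, 1], [1, 0]],
    values=tf.constant([0, 5, 7], tf.int64),
    dense_shape=[2, 2])

# Per-shard partial sums run on each shard's device before the final
# cross-device combine, which is the point of this op.
embedded = embedding_ops.embedding_lookup_sparse_with_distributed_aggregation(
    params, sp_ids, None, combiner="mean")  # None: all weights taken as 1

with tf.Session() as sess:
  sess.run(tf.global_variables_initializer())
  print(sess.run(embedded))  # dense result of shape [2, 4]
```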
diff --git a/tensorflow/contrib/layers/python/layers/embedding_ops_test.py b/tensorflow/contrib/layers/python/layers/embedding_ops_test.py
index dfa8067f27..bf25144982 100644
--- a/tensorflow/contrib/layers/python/layers/embedding_ops_test.py
+++ b/tensorflow/contrib/layers/python/layers/embedding_ops_test.py
@@ -31,10 +31,13 @@ from tensorflow.python.framework import dtypes
from tensorflow.python.framework import errors_impl
from tensorflow.python.framework import random_seed
from tensorflow.python.framework import sparse_tensor as sparse_tensor_lib
+from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import gradient_checker
from tensorflow.python.ops import init_ops
from tensorflow.python.ops import math_ops
from tensorflow.python.ops import partitioned_variables
from tensorflow.python.platform import test
+from tensorflow.python.util import compat
class SafeEmbeddingLookupSparseTest(test.TestCase):
@@ -143,8 +146,8 @@ class SafeEmbeddingLookupSparseTest(test.TestCase):
self.assertAllClose(
embedding_lookup_result,
[(embedding_weights[0][0] + embedding_weights[0][1]) / 2.0, [0] * 4,
- [0] * 4, embedding_weights[0][2],
- (embedding_weights[0][0] + embedding_weights[0][1]) / 2.0])
+ [0] * 4, embedding_weights[0][2], (
+ embedding_weights[0][0] + embedding_weights[0][1]) / 2.0])
def test_safe_embedding_lookup_sparse_partitioned(self):
with self.test_session():
@@ -169,8 +172,8 @@ class SafeEmbeddingLookupSparseTest(test.TestCase):
self.assertRaises(ValueError, embedding_ops.safe_embedding_lookup_sparse,
embedding_weights, sparse_ids)
embedding_weights = [
- constant_op.constant(
- w, dtype=dtypes.float64) for w in embedding_weights
+ constant_op.constant(w, dtype=dtypes.float64)
+ for w in embedding_weights
]
self.assertRaises(ValueError, embedding_ops.safe_embedding_lookup_sparse,
embedding_weights, sparse_ids, sparse_weights)
@@ -183,11 +186,10 @@ class SafeEmbeddingLookupSparseTest(test.TestCase):
embedding_lookup_result = (embedding_ops.safe_embedding_lookup_sparse(
embedding_weights, sparse_ids, sparse_weights).eval())
- self.assertAllClose(
- embedding_lookup_result,
- [[(1.0 * embedding_weights[0][0] + 2.0 * embedding_weights[0][1]) /
- 3.0, [0] * 4, [0] * 4],
- [embedding_weights[0][2], [0] * 4, [0] * 4]])
+ self.assertAllClose(embedding_lookup_result, [[
+ (1.0 * embedding_weights[0][0] + 2.0 * embedding_weights[0][1]) / 3.0,
+ [0] * 4, [0] * 4
+ ], [embedding_weights[0][2], [0] * 4, [0] * 4]])
def test_safe_embedding_lookup_sparse_3d_return_special_vector(self):
with self.test_session():
@@ -213,14 +215,13 @@ class SafeEmbeddingLookupSparseTest(test.TestCase):
embedding_lookup_result = (embedding_ops.safe_embedding_lookup_sparse(
embedding_weights, sparse_ids, None).eval())
- self.assertAllClose(
- embedding_lookup_result,
- [[(embedding_weights[0][0] + embedding_weights[0][1]) / 2.0, [0] * 4,
- [0] * 4], [
- embedding_weights[0][2],
- (embedding_weights[0][0] + embedding_weights[0][1]) / 2.0,
- [0] * 4
- ]])
+ self.assertAllClose(embedding_lookup_result, [[(
+ embedding_weights[0][0] + embedding_weights[0][1]) / 2.0, [0] * 4, [
+ 0
+ ] * 4], [
+ embedding_weights[0][2],
+ (embedding_weights[0][0] + embedding_weights[0][1]) / 2.0, [0] * 4
+ ]])
def test_safe_embedding_lookup_sparse_3d_partitioned(self):
with self.test_session():
@@ -231,13 +232,12 @@ class SafeEmbeddingLookupSparseTest(test.TestCase):
embedding_weights, sparse_ids, None).eval())
embedding_weights = list(itertools.chain(*embedding_weights))
- self.assertAllClose(embedding_lookup_result,
- [[(embedding_weights[0] + embedding_weights[1]) / 2.0,
- [0] * 4, [0] * 4], [
- embedding_weights[2],
- (embedding_weights[0] + embedding_weights[1]) /
- 2.0, [0] * 4
- ]])
+ self.assertAllClose(embedding_lookup_result, [[
+ (embedding_weights[0] + embedding_weights[1]) / 2.0, [0] * 4, [0] * 4
+ ], [
+ embedding_weights[2],
+ (embedding_weights[0] + embedding_weights[1]) / 2.0, [0] * 4
+ ]])
def test_safe_embedding_lookup_sparse_3d_partitioned_inconsistent_weights(
self):
@@ -249,8 +249,8 @@ class SafeEmbeddingLookupSparseTest(test.TestCase):
self.assertRaises(ValueError, embedding_ops.safe_embedding_lookup_sparse,
embedding_weights, sparse_ids)
embedding_weights = [
- constant_op.constant(
- w, dtype=dtypes.float64) for w in embedding_weights
+ constant_op.constant(w, dtype=dtypes.float64)
+ for w in embedding_weights
]
self.assertRaises(ValueError, embedding_ops.safe_embedding_lookup_sparse,
embedding_weights, sparse_ids, sparse_weights)
@@ -299,8 +299,8 @@ class ScatteredEmbeddingLookupTest(test.TestCase):
self.assertAllEqual(embedding_lookup_result[0],
embedding_lookup_result[1])
# Different embedding expected for different value.
- embedding_diff = np.min((embedding_lookup_result[2] -
- embedding_lookup_result[0])**2)
+ embedding_diff = np.min(
+ (embedding_lookup_result[2] - embedding_lookup_result[0])**2)
self.assertGreater(embedding_diff, 0)
def test_scattered_embedding_coverage(self):
@@ -318,8 +318,8 @@ class ScatteredEmbeddingLookupTest(test.TestCase):
def test_scattered_embedding_multi_dimension(self):
with self.test_session():
embedding_weights = self._random_weights()
- values = constant_op.constant(
- [["foo", "bar", "bar"], ["bar", "bar", "foo"]])
+ values = constant_op.constant([["foo", "bar", "bar"],
+ ["bar", "bar", "foo"]])
embedding_lookup_result = embedding_ops.scattered_embedding_lookup(
embedding_weights, values, dimension=10).eval()
@@ -338,8 +338,8 @@ class ScatteredEmbeddingLookupTest(test.TestCase):
embedding_lookup_result = (
embedding_ops.scattered_embedding_lookup_sparse(
- embedding_weights, sparse_tensor, dimension=5, combiner="mean")
- .eval())
+ embedding_weights, sparse_tensor, dimension=5,
+ combiner="mean").eval())
self.assertAllEqual(embedding_lookup_result.shape, [5, 5])
# Same non-zero embedding for the empty rows filled with a default value.
@@ -431,8 +431,8 @@ class SampledScatteredEmbeddingLookupTest(test.TestCase):
def test_hashed_embedding_multi_dimension(self):
with self.test_session():
embedding_weights = self._random_weights()
- values = constant_op.constant(
- [["foo", "bar", "bar"], ["bar", "bar", "foo"]])
+ values = constant_op.constant([["foo", "bar", "bar"],
+ ["bar", "bar", "foo"]])
sampled_candidates = constant_op.constant(
[[[1, 3, 4, 6], [1, 7, 8, 9], [1, 7, 8, 9]],
[[1, 7, 8, 9], [1, 7, 8, 9], [1, 3, 4, 6]]])
@@ -489,8 +489,8 @@ class SampledScatteredEmbeddingLookupSparseTest(test.TestCase):
result = embedding_ops._sampled_scattered_embedding_lookup_sparse(
params, sp_values, dimension=5, hash_key=self._hash_key)
- self.assertAllClose(result.eval(), [[0., 0., 0., 0., 0.],
- [.3, .2, .2, .3, .1],
+ self.assertAllClose(result.eval(), [[0., 0., 0., 0.,
+ 0.], [.3, .2, .2, .3, .1],
[0., 0., 0., 0., 0.]])
def test_output_values_with_sampled_candidates(self):
@@ -563,5 +563,224 @@ class SampledScatteredEmbeddingLookupSparseTest(test.TestCase):
self.assertAllClose(result.eval(), result_abc.eval())
+def _PName(param_id):
+ return "p" + str(param_id)
+
+
+def _EmbeddingParams(num_shards,
+ vocab_size,
+ dtype=dtypes.float32,
+ shape=None,
+ use_shapeless_placeholder=False):
+ p = []
+ params = {}
+ feed_dict = {}
+ if not shape:
+ shape = [10]
+ for i in range(num_shards):
+ shard_shape = [vocab_size // num_shards] + shape
+ if i < vocab_size % num_shards: # Excess goes evenly on the first shards
+ shard_shape[0] += 1
+
+ param_name = _PName(i)
+
+ if use_shapeless_placeholder:
+ param = array_ops.placeholder(dtype, shape=None, name=param_name)
+ else:
+ param = constant_op.constant(
+ 1.0, shape=shard_shape, dtype=dtype, name=param_name)
+ p.append(param)
+ np_type = "f" if dtype == dtypes.float32 else "d"
+ val = (np.random.rand(*shard_shape).astype(np_type)) + 1
+ params[param_name + ":0"] = val
+ feed_dict[param.name] = val
+ return p, params, feed_dict
+
+
+def _EmbeddingResult(params,
+ id_vals,
+ num_shards,
+ vocab_size,
+ partition_strategy="mod",
+ weight_vals=None):
+ if weight_vals is None:
+ weight_vals = np.copy(id_vals)
+ weight_vals.fill(1)
+ values = []
+ weights = []
+ weights_squared = []
+ for ids, wts in zip(id_vals, weight_vals):
+ value_aggregation = None
+ weight_aggregation = None
+ squared_weight_aggregation = None
+ if isinstance(ids, compat.integral_types):
+ ids = [ids]
+ wts = [wts]
+ for i, weight_value in zip(ids, wts):
+ if partition_strategy == "mod":
+ val = np.copy(params[_PName(i % num_shards) + ":0"][
+ i // num_shards, :]) * weight_value
+ elif partition_strategy == "div":
+ ids_per_partition, extras = divmod(vocab_size, num_shards)
+ threshold = extras * (ids_per_partition + 1)
+ if i < threshold:
+ partition = i // (ids_per_partition + 1)
+ offset = i % (ids_per_partition + 1)
+ else:
+ partition = extras + (i - threshold) // ids_per_partition
+ offset = (i - threshold) % ids_per_partition
+ val = np.copy(
+ params[_PName(partition) + ":0"][offset, :]) * weight_value
+ else:
+ assert False
+ if value_aggregation is None:
+ assert weight_aggregation is None
+ assert squared_weight_aggregation is None
+ value_aggregation = val
+ weight_aggregation = weight_value
+ squared_weight_aggregation = weight_value * weight_value
+ else:
+ assert weight_aggregation is not None
+ assert squared_weight_aggregation is not None
+ value_aggregation += val
+ weight_aggregation += weight_value
+ squared_weight_aggregation += weight_value * weight_value
+ values.append(value_aggregation)
+ weights.append(weight_aggregation)
+ weights_squared.append(squared_weight_aggregation)
+ values = np.array(values).astype(np.float32)
+ weights = np.array(weights).astype(np.float32)
+ weights_squared = np.array(weights_squared).astype(np.float32)
+ return values, weights, weights_squared
+
+
+class EmbeddingLookupSparseWithDistributedAggregationTest(test.TestCase):
+
+ def _RandomIdsAndWeights(self, batch_size, vocab_size):
+ max_val_per_entry = 6
+ vals_per_batch_entry = np.random.randint(
+ 1, max_val_per_entry, size=batch_size)
+ num_vals = np.sum(vals_per_batch_entry)
+
+ ids = np.random.randint(vocab_size, size=num_vals)
+ weights = 1 + np.random.rand(num_vals)
+
+ indices = []
+ for batch_entry, num_val in enumerate(vals_per_batch_entry):
+ for val_index in range(num_val):
+ indices.append([batch_entry, val_index])
+
+ shape = [batch_size, max_val_per_entry]
+
+ sp_ids = sparse_tensor_lib.SparseTensor(
+ constant_op.constant(indices, dtypes.int64),
+ constant_op.constant(ids, dtypes.int32),
+ constant_op.constant(shape, dtypes.int64))
+ sp_weights = sparse_tensor_lib.SparseTensor(
+ constant_op.constant(indices, dtypes.int64),
+ constant_op.constant(weights, dtypes.float32),
+ constant_op.constant(shape, dtypes.int64))
+
+ return sp_ids, sp_weights, ids, weights, vals_per_batch_entry
+
+ def _GroupByBatchEntry(self, vals, vals_per_batch_entry):
+ grouped_vals = []
+ index = 0
+ for num_val in vals_per_batch_entry:
+ grouped_vals.append(list(vals[index:(index + num_val)]))
+ index += num_val
+ return grouped_vals
+
+ def testEmbeddingLookupSparse(self):
+ vocab_size = 13
+ batch_size = 10
+ param_shape = [2, 5]
+ expected_lookup_result_shape = [None] + param_shape
+
+ sp_ids, sp_weights, ids, weights, vals_per_batch_entry = (
+ self._RandomIdsAndWeights(batch_size, vocab_size))
+
+ grouped_ids = self._GroupByBatchEntry(ids, vals_per_batch_entry)
+ grouped_weights = self._GroupByBatchEntry(weights, vals_per_batch_entry)
+ grouped_ignored_weights = self._GroupByBatchEntry(
+ np.ones(np.sum(vals_per_batch_entry)), vals_per_batch_entry)
+
+ for num_shards, combiner, dtype, ignore_weights in itertools.product(
+ [1, 5], ["sum", "mean", "sqrtn"], [dtypes.float32,
+ dtypes.float64], [True, False]):
+
+ with self.test_session():
+ p, params, feed_dict = _EmbeddingParams(
+ num_shards, vocab_size, shape=param_shape, dtype=dtype)
+ embedding_sum = \
+ embedding_ops.embedding_lookup_sparse_with_distributed_aggregation(
+ p,
+ sp_ids,
+ None if ignore_weights else sp_weights,
+ combiner=combiner)
+
+ self.assertEqual(embedding_sum.get_shape().as_list(),
+ expected_lookup_result_shape)
+
+ tf_embedding_sum = embedding_sum.eval(feed_dict=feed_dict)
+
+ np_embedding_sum, np_weight_sum, np_weight_sq_sum = _EmbeddingResult(
+ params,
+ grouped_ids,
+ num_shards,
+ vocab_size,
+ weight_vals=grouped_ignored_weights
+ if ignore_weights else grouped_weights)
+ if combiner == "mean":
+ np_embedding_sum /= np.reshape(np_weight_sum, (batch_size, 1, 1))
+ if combiner == "sqrtn":
+ np_embedding_sum /= np.reshape(
+ np.sqrt(np_weight_sq_sum), (batch_size, 1, 1))
+ self.assertAllClose(np_embedding_sum, tf_embedding_sum)
+
+ def testGradientsEmbeddingLookupSparse(self):
+ vocab_size = 12
+ batch_size = 4
+ param_shape = [2, 3]
+ sp_ids, sp_weights, _, _, _ = (self._RandomIdsAndWeights(
+ batch_size, vocab_size))
+
+ for num_shards, combiner, dtype, ignore_weights in itertools.product(
+ [1, 3], ["sum", "mean", "sqrtn"], [dtypes.float32,
+ dtypes.float64], [True, False]):
+ with self.test_session():
+ x, params, _ = _EmbeddingParams(
+ num_shards, vocab_size, shape=param_shape, dtype=dtype)
+
+ y = embedding_ops.embedding_lookup_sparse_with_distributed_aggregation(
+ x,
+ sp_ids,
+ None if ignore_weights else sp_weights,
+ combiner=combiner)
+ x_name = [_PName(i) for i in range(num_shards)]
+ x_init_value = [params[x_n + ":0"] for x_n in x_name]
+ x_shape = [i.shape for i in x_init_value]
+ y_shape = [batch_size] + list(params[_PName(0) + ":0"].shape[1:])
+ err = gradient_checker.compute_gradient_error(
+ x, x_shape, y, y_shape, x_init_value=x_init_value)
+ self.assertLess(err, 1e-5 if dtype == dtypes.float64 else 2e-3)
+
+ def testIncompatibleShapes(self):
+ with self.test_session():
+ x, _, _ = _EmbeddingParams(1, 10, dtype=dtypes.float32)
+ sp_ids = sparse_tensor_lib.SparseTensor(
+ constant_op.constant([[0, 0], [0, 1], [1, 0]], dtypes.int64),
+ constant_op.constant([0, 1, 2], dtypes.int32),
+ constant_op.constant([2, 2], dtypes.int64))
+ sp_weights = sparse_tensor_lib.SparseTensor(
+ constant_op.constant([[0, 0], [0, 1]], dtypes.int64),
+ constant_op.constant([12.0, 5.0], dtypes.float32),
+ constant_op.constant([1, 2], dtypes.int64))
+
+ with self.assertRaises(ValueError):
+ embedding_ops.embedding_lookup_sparse_with_distributed_aggregation(
+ x, sp_ids, sp_weights, combiner="mean")
+
+
if __name__ == "__main__":
test.main()
diff --git a/tensorflow/contrib/layers/python/layers/feature_column.py b/tensorflow/contrib/layers/python/layers/feature_column.py
index 282c556424..32839b251a 100644
--- a/tensorflow/contrib/layers/python/layers/feature_column.py
+++ b/tensorflow/contrib/layers/python/layers/feature_column.py
@@ -791,9 +791,11 @@ def weighted_sparse_column(sparse_id_column,
weight or value of the corresponding sparse id feature.
dtype: Type of weights, such as `tf.float32`. Only floating and integer
weights are supported.
+
Returns:
A _WeightedSparseColumn composed of two sparse features: one represents id,
the other represents weight (value) of the id feature in that example.
+
Raises:
ValueError: if dtype is not convertible to float.
"""
diff --git a/tensorflow/contrib/learn/python/learn/README.md b/tensorflow/contrib/learn/python/learn/README.md
index 0aae178e9a..6a7b0ea614 100644
--- a/tensorflow/contrib/learn/python/learn/README.md
+++ b/tensorflow/contrib/learn/python/learn/README.md
@@ -9,7 +9,7 @@ TF Learn is a simplified interface for TensorFlow, to get people started on pred
### Why *TensorFlow Learn*?
-- To smooth the transition from the [scikit-learn](http://scikit-learn.org/stable/) world of one-liner machine learning into the more open world of building different shapes of ML models. You can start by using [fit](../../../../g3doc/api_docs/python/contrib.learn.md#Estimator.fit)/[predict](../../../../g3doc/api_docs/python/contrib.learn.md#Estimator.predict) and slide into TensorFlow APIs as you are getting comfortable.
+- To smooth the transition from the [scikit-learn](http://scikit-learn.org/stable/) world of one-liner machine learning into the more open world of building different shapes of ML models. You can start by using [fit](https://www.tensorflow.org/api_docs/python/tf/contrib/learn/Estimator#fit)/[predict](https://www.tensorflow.org/api_docs/python/tf/contrib/learn/Estimator#predict) and slide into TensorFlow APIs as you are getting comfortable.
- To provide a set of reference models that will be easy to integrate with existing code.
## Installation
@@ -43,17 +43,17 @@ Optionally you can install [scikit-learn](http://scikit-learn.org/stable/) and [
### Existing Estimator Implementations
- [`LinearClassifier`](https://www.tensorflow.org/code/tensorflow/contrib/learn/python/learn/estimators/linear.py)
- ([docs](../../../../g3doc/api_docs/python/contrib.learn.md#LinearClassifier))
+ ([docs](https://www.tensorflow.org/api_docs/python/tf/contrib/learn/LinearClassifier))
- [`LinearRegressor`](https://www.tensorflow.org/code/tensorflow/contrib/learn/python/learn/estimators/linear.py)
- ([docs](../../../../g3doc/api_docs/python/contrib.learn.md#LinearRegressor))
+ ([docs](https://www.tensorflow.org/api_docs/python/tf/contrib/learn/LinearRegressor))
- [`DNNClassifier`](https://www.tensorflow.org/code/tensorflow/contrib/learn/python/learn/estimators/dnn.py)
- ([docs](../../../../g3doc/api_docs/python/contrib.learn.md#DNNClassifier))
+ ([docs](https://www.tensorflow.org/api_docs/python/tf/contrib/learn/DNNClassifier))
- [`DNNRegressor`](https://www.tensorflow.org/code/tensorflow/contrib/learn/python/learn/estimators/dnn.py)
- ([docs](../../../../g3doc/api_docs/python/contrib.learn.md#DNNRegressor))
+ ([docs](https://www.tensorflow.org/api_docs/python/tf/contrib/learn/DNNRegressor))
- [`DNNLinearCombinedClassifier`](https://www.tensorflow.org/code/tensorflow/contrib/learn/python/learn/estimators/dnn_linear_combined.py)
- ([docs](../../../../g3doc/api_docs/python/contrib.learn.md#DNNLinearCombinedClassifier))
+ ([docs](https://www.tensorflow.org/api_docs/python/tf/contrib/learn/DNNLinearCombinedClassifier))
- [`DNNLinearCombinedRegressor`](https://www.tensorflow.org/code/tensorflow/contrib/learn/python/learn/estimators/dnn_linear_combined.py)
- ([docs](../../../../g3doc/api_docs/python/contrib.learn.md#DNNLinearCombinedRegressor))
+ ([docs](https://www.tensorflow.org/api_docs/python/tf/contrib/learn/DNNLinearCombinedRegressor))
- [`SVM`](https://www.tensorflow.org/code/tensorflow/contrib/learn/python/learn/estimators/svm.py)
([docs](https://www.tensorflow.org/code/tensorflow/contrib/learn/python/learn/estimators/g3doc/svm.md))
- [`GMM`](https://www.tensorflow.org/code/tensorflow/contrib/factorization/python/ops/gmm.py)
@@ -67,7 +67,7 @@ Below are a few simple examples of the API. For more examples, please see [examp
General tips:
-- It's useful to rescale a dataset to 0 mean and unit standard deviation before passing it to an [`Estimator`](../../../../g3doc/api_docs/python/contrib.learn.md#estimators). [Stochastic Gradient Descent](https://en.wikipedia.org/wiki/Stochastic_gradient_descent) doesn't always do the right thing when variable are at very different scales.
+- It's useful to rescale a dataset to 0 mean and unit standard deviation before passing it to an [`Estimator`](https://www.tensorflow.org/api_docs/python/tf/contrib/learn/Estimator). [Stochastic Gradient Descent](https://en.wikipedia.org/wiki/Stochastic_gradient_descent) doesn't always do the right thing when variables are at very different scales.
- Categorical variables should be managed before passing input to the estimator.
@@ -219,7 +219,7 @@ INFO:tensorflow:Loss for final step: 0.0162506.</pre>
## Summaries
-If you supply a `model_dir` argument to your `Estimator`s, TensorFlow will write summaries for ``loss`` and histograms for variables in this directory. (You can also add custom summaries in your custom model function by calling [Summary](../../../../g3doc/api_docs/python/train.md#summary-operations) operations.)
+If you supply a `model_dir` argument to your `Estimator`s, TensorFlow will write summaries for ``loss`` and histograms for variables in this directory. (You can also add custom summaries in your custom model function by calling [Summary](https://www.tensorflow.org/api_guides/python/summary) operations.)
To view the summaries in TensorBoard, run the following command, where `logdir` is the `model_dir` for your `Estimator`:
diff --git a/tensorflow/contrib/learn/python/learn/dataframe/queues/feeding_functions.py b/tensorflow/contrib/learn/python/learn/dataframe/queues/feeding_functions.py
index b2da9a7cc0..b891bf2301 100644
--- a/tensorflow/contrib/learn/python/learn/dataframe/queues/feeding_functions.py
+++ b/tensorflow/contrib/learn/python/learn/dataframe/queues/feeding_functions.py
@@ -22,6 +22,7 @@ from __future__ import print_function
# pylint: disable=unused-import
from tensorflow.python.estimator.inputs.queues.feeding_functions import _ArrayFeedFn
from tensorflow.python.estimator.inputs.queues.feeding_functions import _enqueue_data as enqueue_data
+from tensorflow.python.estimator.inputs.queues.feeding_functions import _GeneratorFeedFn
from tensorflow.python.estimator.inputs.queues.feeding_functions import _OrderedDictNumpyFeedFn
from tensorflow.python.estimator.inputs.queues.feeding_functions import _PandasFeedFn
# pylint: enable=unused-import
diff --git a/tensorflow/contrib/learn/python/learn/estimators/run_config.py b/tensorflow/contrib/learn/python/learn/estimators/run_config.py
index 8f8ab3b335..bc7465bbc2 100644
--- a/tensorflow/contrib/learn/python/learn/estimators/run_config.py
+++ b/tensorflow/contrib/learn/python/learn/estimators/run_config.py
@@ -200,6 +200,7 @@ class RunConfig(ClusterConfig):
parameter servers), you probably want to use `learn_runner.EstimatorConfig`
instead.
"""
+ _USE_DEFAULT = 0
def __init__(self,
master=None,
@@ -208,7 +209,7 @@ class RunConfig(ClusterConfig):
gpu_memory_fraction=1,
tf_random_seed=None,
save_summary_steps=100,
- save_checkpoints_secs=600,
+ save_checkpoints_secs=_USE_DEFAULT,
save_checkpoints_steps=None,
keep_checkpoint_max=5,
keep_checkpoint_every_n_hours=10000,
@@ -260,6 +261,11 @@ class RunConfig(ClusterConfig):
self._tf_random_seed = tf_random_seed
self._save_summary_steps = save_summary_steps
self._save_checkpoints_secs = save_checkpoints_secs
+ if save_checkpoints_secs == RunConfig._USE_DEFAULT:
+ if save_checkpoints_steps is None:
+ self._save_checkpoints_secs = 600
+ else:
+ self._save_checkpoints_secs = None
self._save_checkpoints_steps = save_checkpoints_steps
# TODO(weiho): Remove these after ModelFn refactoring, when users can
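
The hunk above introduces a sentinel so the constructor can distinguish "caller passed nothing" from an explicit value. A standalone sketch of the same pattern (only the `_USE_DEFAULT` name and the 600s fallback mirror the hunk; the class is illustrative):

```python
_USE_DEFAULT = 0  # sentinel: distinct from None, which is meaningful here


class Config(object):

  def __init__(self,
               save_checkpoints_secs=_USE_DEFAULT,
               save_checkpoints_steps=None):
    if save_checkpoints_secs == _USE_DEFAULT:
      # Fall back to the 600s timer only when step-based saving is off.
      save_checkpoints_secs = 600 if save_checkpoints_steps is None else None
    self.save_checkpoints_secs = save_checkpoints_secs
    self.save_checkpoints_steps = save_checkpoints_steps


assert Config().save_checkpoints_secs == 600
assert Config(save_checkpoints_steps=100).save_checkpoints_secs is None
assert Config(save_checkpoints_secs=30).save_checkpoints_secs == 30
```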
diff --git a/tensorflow/contrib/learn/python/learn/learn_io/__init__.py b/tensorflow/contrib/learn/python/learn/learn_io/__init__.py
index 32252cd8e3..4567928358 100644
--- a/tensorflow/contrib/learn/python/learn/learn_io/__init__.py
+++ b/tensorflow/contrib/learn/python/learn/learn_io/__init__.py
@@ -35,3 +35,4 @@ from tensorflow.contrib.learn.python.learn.learn_io.pandas_io import extract_pan
from tensorflow.contrib.learn.python.learn.learn_io.pandas_io import extract_pandas_matrix
from tensorflow.contrib.learn.python.learn.learn_io.pandas_io import HAS_PANDAS
from tensorflow.contrib.learn.python.learn.learn_io.pandas_io import pandas_input_fn
+from tensorflow.contrib.learn.python.learn.learn_io.generator_io import generator_input_fn
diff --git a/tensorflow/contrib/learn/python/learn/learn_io/generator_io.py b/tensorflow/contrib/learn/python/learn/learn_io/generator_io.py
new file mode 100644
index 0000000000..5859bb6b47
--- /dev/null
+++ b/tensorflow/contrib/learn/python/learn/learn_io/generator_io.py
@@ -0,0 +1,134 @@
+# Copyright 2016 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Methods to allow generator of dict with numpy arrays."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from types import FunctionType, GeneratorType
+from collections import Container
+
+from tensorflow.contrib.learn.python.learn.dataframe.queues import feeding_functions
+
+
+def generator_input_fn(x,
+ target_key=None,
+ batch_size=128,
+ num_epochs=1,
+ shuffle=True,
+ queue_capacity=1000,
+ num_threads=1):
+ """Returns input function that would dicts of numpy arrays
+ yielded from a generator.
+
+ It is assumed that every dict yielded from the dictionary represents
+ a single sample. The generator should consume a single epoch of the data.
+
+ This returns a function outputting `features` and `target` based on the dict
+ of numpy arrays. The dict `features` has the same keys as an element yielded
+ from x.
+
+ Example:
+ ```python
+ def generator():
+ for index in range(10):
+      yield {'height': np.random.randint(32, 36),
+             'age': np.random.randint(18, 80),
+             'label': np.ones(1)}
+
+  input_fn = generator_io.generator_input_fn(
+      generator, target_key="label", batch_size=2, shuffle=False,
+      num_epochs=1)
+  features, target = input_fn()
+  ```
+
+ Args:
+    x: Generator function, returns a `Generator` that will yield the data
+      as `dict`s of numpy arrays.
+    target_key: String or Container of Strings, the key or Container of keys
+      of the numpy arrays in the dicts yielded from x to use as target.
+ batch_size: Integer, size of batches to return.
+ num_epochs: Integer, number of epochs to iterate over data. If `None` will
+ run forever.
+    shuffle: Boolean, if True shuffles the queue. Avoid shuffling at
+      prediction time.
+ queue_capacity: Integer, size of queue to accumulate.
+ num_threads: Integer, number of threads used for reading and enqueueing.
+
+ Returns:
+    A function that returns a feature `dict` of `Tensors` and, if `target_key`
+    is given, a label: a `dict` of `Tensors` when `target_key` names several
+    keys, or a single `Tensor` when it names one.
+
+ Raises:
+ TypeError: `x` is not `FunctionType`.
+ TypeError: `x()` is not `GeneratorType`.
+ TypeError: `next(x())` is not `dict`.
+ TypeError: `target_key` is not `str` or `target_key` is not `Container`
+ of `str`.
+ KeyError: `target_key` not a key or `target_key[index]` not in next(`x()`).
+    KeyError: key mismatch between the dicts emitted from `x()`.
+ """
+ if not isinstance(x, FunctionType):
+ raise TypeError(
+ 'x must be generator function; got {}'.format(type(x).__name__))
+ generator = x()
+ if not isinstance(generator, GeneratorType):
+ raise TypeError(
+ 'x() must be generator; got {}'.format(type(generator).__name__))
+ data = next(generator)
+ if not isinstance(data, dict):
+ raise TypeError('x() must yield dict; got {}'.format(type(data).__name__))
+  input_keys = sorted(data.keys())  # use the dict already drawn above
+ if target_key is not None:
+ if isinstance(target_key, str):
+ target_key = [target_key]
+ elif isinstance(target_key, Container):
+ for item in target_key:
+ if not isinstance(item, str):
+ raise TypeError('target_key must be str or Container of str; got {}'.
+ format(type(item).__name__))
+ if item not in input_keys:
+ raise KeyError(
+ 'target_key not in yielded dict. Expected {} keys; got {}'.format(
+ input_keys, item))
+ else:
+ raise TypeError('target_key must be str or Container of str; got {}'.
+ format(type(target_key).__name__))
+
+ def _generator_input_fn():
+ """generator input function."""
+ queue = feeding_functions.enqueue_data(
+ x,
+ queue_capacity,
+ shuffle=shuffle,
+ num_threads=num_threads,
+ enqueue_size=batch_size,
+ num_epochs=num_epochs)
+
+ features = (queue.dequeue_many(batch_size)
+ if num_epochs is None else queue.dequeue_up_to(batch_size))
+ if not isinstance(features, list):
+ features = [features]
+ features = dict(zip(input_keys, features))
+ if target_key is not None:
+ if len(target_key) > 1:
+ target = {key: features.pop(key) for key in target_key}
+ else:
+ target = features.pop(target_key[0])
+ return features, target
+ return features
+
+ return _generator_input_fn
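
A runnable sketch of the intended call pattern, using the same queue-runner idiom as the tests below (feature names and sizes are illustrative):

```python
import numpy as np
import tensorflow as tf
from tensorflow.contrib.learn.python.learn.learn_io import generator_io


def generator():
  for index in range(4):
    yield {'age': np.ones(1) * index, 'label': np.ones(1) * (index - 32)}


input_fn = generator_io.generator_input_fn(
    generator, target_key='label', batch_size=2, shuffle=False, num_epochs=1)

with tf.Session() as session:
  features, target = input_fn()
  coord = tf.train.Coordinator()
  threads = tf.train.start_queue_runners(session, coord=coord)
  print(session.run([features, target]))  # two samples per fetch
  coord.request_stop()
  coord.join(threads)
```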
diff --git a/tensorflow/contrib/learn/python/learn/learn_io/generator_io_test.py b/tensorflow/contrib/learn/python/learn/learn_io/generator_io_test.py
new file mode 100644
index 0000000000..bc767ec18b
--- /dev/null
+++ b/tensorflow/contrib/learn/python/learn/learn_io/generator_io_test.py
@@ -0,0 +1,348 @@
+# Copyright 2016 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Tests for numpy_io."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import sys
+
+# TODO: #6568 Remove this hack that makes dlopen() not crash.
+if hasattr(sys, 'getdlopenflags') and hasattr(sys, 'setdlopenflags'):
+ import ctypes
+
+ sys.setdlopenflags(sys.getdlopenflags() | ctypes.RTLD_GLOBAL)
+
+import numpy as np
+from tensorflow.contrib.learn.python.learn.learn_io import generator_io
+from tensorflow.python.framework import errors
+from tensorflow.python.platform import test
+from tensorflow.python.training import coordinator
+from tensorflow.python.training import queue_runner_impl
+
+
+class GeneratorIoTest(test.TestCase):
+
+ def testGeneratorInputFn(self):
+
+ def generator():
+ for index in range(2):
+ yield {
+ 'a': np.ones(1) * index,
+ 'b': np.ones(1) * index + 32,
+ 'label': np.ones(1) * index - 32
+ }
+
+ with self.test_session() as session:
+ input_fn = generator_io.generator_input_fn(
+ generator,
+ target_key='label',
+ batch_size=2,
+ shuffle=False,
+ num_epochs=1)
+ features, target = input_fn()
+
+ coord = coordinator.Coordinator()
+ threads = queue_runner_impl.start_queue_runners(session, coord=coord)
+
+ res = session.run([features, target])
+ self.assertAllEqual(res[0]['a'], np.asarray([0, 1]).reshape(-1, 1))
+ self.assertAllEqual(res[0]['b'], np.asarray([32, 33]).reshape(-1, 1))
+ self.assertAllEqual(res[1], np.asarray([-32, -31]).reshape(-1, 1))
+
+ session.run([features])
+ with self.assertRaises(errors.OutOfRangeError):
+ session.run([features, target])
+
+ coord.request_stop()
+ coord.join(threads)
+
+ def testGeneratorSingleInputFn(self):
+
+ def generator():
+ for index in range(2):
+ yield {'a': np.ones(1) * index}
+
+ with self.test_session() as session:
+ input_fn = generator_io.generator_input_fn(
+ generator, target_key=None, batch_size=2, shuffle=False, num_epochs=1)
+ features = input_fn()
+
+ coord = coordinator.Coordinator()
+ threads = queue_runner_impl.start_queue_runners(session, coord=coord)
+
+ res = session.run([features])
+ self.assertAllEqual(res[0]['a'], np.asarray([0, 1]).reshape(-1, 1))
+
+ session.run([features])
+ with self.assertRaises(errors.OutOfRangeError):
+ session.run([features])
+
+ coord.request_stop()
+ coord.join(threads)
+
+ def testGeneratorInputFnLabelDict(self):
+
+ def generator():
+ for index in range(2):
+ yield {
+ 'a': np.ones(1) * index,
+ 'b': np.ones(1) * index + 32,
+ 'label': np.ones(1) * index - 32,
+ 'label2': np.ones(1) * index - 64,
+ }
+
+ with self.test_session() as session:
+ input_fn = generator_io.generator_input_fn(
+ generator,
+ target_key=['label', 'label2'],
+ batch_size=2,
+ shuffle=False,
+ num_epochs=1)
+ features, target = input_fn()
+
+ coord = coordinator.Coordinator()
+ threads = queue_runner_impl.start_queue_runners(session, coord=coord)
+
+ res = session.run([features, target])
+ self.assertAllEqual(res[0]['a'], np.asarray([0, 1]).reshape(-1, 1))
+ self.assertAllEqual(res[0]['b'], np.asarray([32, 33]).reshape(-1, 1))
+ self.assertAllEqual(res[1]['label'], np.asarray([-32, -31]).reshape(
+ -1, 1))
+ self.assertAllEqual(res[1]['label2'],
+ np.asarray([-64, -63]).reshape(-1, 1))
+
+ session.run([features])
+ with self.assertRaises(errors.OutOfRangeError):
+ session.run([features, target])
+
+ coord.request_stop()
+ coord.join(threads)
+
+ def testGeneratorInputFnWithDifferentDimensionsOfFeatures(self):
+
+ def generator():
+ for index in range(100):
+ yield {
+ 'a': np.ones((10, 10)) * index,
+ 'b': np.ones((5, 5)) * index + 32,
+ 'label': np.ones((3, 3)) * index - 32
+ }
+
+ with self.test_session() as session:
+ input_fn = generator_io.generator_input_fn(
+ generator,
+ target_key='label',
+ batch_size=2,
+ shuffle=False,
+ num_epochs=1)
+ features, target = input_fn()
+
+ coord = coordinator.Coordinator()
+ threads = queue_runner_impl.start_queue_runners(session, coord=coord)
+
+ res = session.run([features, target])
+ self.assertAllEqual(res[0]['a'],
+ np.vstack((np.zeros((10, 10)), np.ones(
+ (10, 10)))).reshape(2, 10, 10))
+ self.assertAllEqual(res[0]['b'],
+ np.vstack((np.zeros((5, 5)), np.ones(
+ (5, 5)))).reshape(2, 5, 5) + 32)
+ self.assertAllEqual(res[1],
+ np.vstack((np.zeros((3, 3)), np.ones(
+ (3, 3)))).reshape(2, 3, 3) - 32)
+
+ coord.request_stop()
+ coord.join(threads)
+
+ def testGeneratorInputFnWithXAsNonGeneratorFunction(self):
+ x = np.arange(32, 36)
+ with self.test_session():
+ with self.assertRaisesRegexp(TypeError, 'x must be generator function'):
+ failing_input_fn = generator_io.generator_input_fn(
+ x, batch_size=2, shuffle=False, num_epochs=1)
+ failing_input_fn()
+
+ def testGeneratorInputFnWithXAsNonGenerator(self):
+
+ def generator():
+ return np.arange(32, 36)
+
+ with self.test_session():
+ with self.assertRaisesRegexp(TypeError, 'x\(\) must be generator'):
+ failing_input_fn = generator_io.generator_input_fn(
+ generator, batch_size=2, shuffle=False, num_epochs=1)
+ failing_input_fn()
+
+ def testGeneratorInputFnWithXAsNonGeneratorYieldingDicts(self):
+
+ def generator():
+ yield np.arange(32, 36)
+
+ with self.test_session():
+ with self.assertRaisesRegexp(TypeError, 'x\(\) must yield dict'):
+ failing_input_fn = generator_io.generator_input_fn(
+ generator, batch_size=2, shuffle=False, num_epochs=1)
+ failing_input_fn()
+
+ def testGeneratorInputFNWithTargetLabelNotString(self):
+
+ def generator():
+ for index in range(2):
+ yield {
+ 'a': np.ones((10, 10)) * index,
+ 'b': np.ones((5, 5)) * index + 32,
+ 'label': np.ones((3, 3)) * index - 32
+ }
+
+ y = np.arange(32, 36)
+ with self.test_session():
+ with self.assertRaisesRegexp(TypeError, 'target_key must be str or'
+ ' Container of str'):
+ failing_input_fn = generator_io.generator_input_fn(
+ generator, target_key=y, batch_size=2, shuffle=False, num_epochs=1)
+ failing_input_fn()
+
+ def testGeneratorInputFNWithTargetLabelListNotString(self):
+
+ def generator():
+ for index in range(2):
+ yield {
+ 'a': np.ones((10, 10)) * index,
+ 'b': np.ones((5, 5)) * index + 32,
+ 'label': np.ones((3, 3)) * index - 32
+ }
+
+ y = ['label', np.arange(10)]
+ with self.test_session():
+ with self.assertRaisesRegexp(TypeError, 'target_key must be str or'
+ ' Container of str'):
+ failing_input_fn = generator_io.generator_input_fn(
+ generator, target_key=y, batch_size=2, shuffle=False, num_epochs=1)
+ failing_input_fn()
+
+ def testGeneratorInputFNWithTargetLabelNotInDict(self):
+
+ def generator():
+ for index in range(2):
+ yield {
+ 'a': np.ones((10, 10)) * index,
+ 'b': np.ones((5, 5)) * index + 32,
+ 'label': np.ones((3, 3)) * index - 32
+ }
+
+ y = ['label', 'target']
+ with self.test_session():
+ with self.assertRaisesRegexp(KeyError, 'target_key not in yielded dict'):
+ failing_input_fn = generator_io.generator_input_fn(
+ generator, target_key=y, batch_size=2, shuffle=False, num_epochs=1)
+ failing_input_fn()
+
+ def testGeneratorInputFnWithNoTargetKey(self):
+
+ def generator():
+ for index in range(2):
+ yield {
+ 'a': np.ones(1) * index,
+ 'b': np.ones(1) * index + 32,
+ 'label': np.ones(1) * index - 32
+ }
+
+ with self.test_session() as session:
+ input_fn = generator_io.generator_input_fn(
+ generator, target_key=None, batch_size=2, shuffle=False, num_epochs=1)
+ features = input_fn()
+
+ coord = coordinator.Coordinator()
+ threads = queue_runner_impl.start_queue_runners(session, coord=coord)
+
+ res = session.run(features)
+ self.assertAllEqual(res['a'], np.asarray([0, 1]).reshape(-1, 1))
+ self.assertAllEqual(res['b'], np.asarray([32, 33]).reshape(-1, 1))
+ self.assertAllEqual(res['label'], np.asarray([-32, -31]).reshape(-1, 1))
+
+ session.run([features])
+ with self.assertRaises(errors.OutOfRangeError):
+ session.run([features])
+
+ coord.request_stop()
+ coord.join(threads)
+
+ def testGeneratorInputFnWithBatchLargerthanData(self):
+
+ def generator():
+ for index in range(2):
+ yield {
+ 'a': np.ones(1) * index,
+ 'b': np.ones(1) * index + 32,
+ 'label': np.ones(1) * index - 32
+ }
+
+ with self.test_session() as session:
+ input_fn = generator_io.generator_input_fn(
+ generator, target_key=None, batch_size=4, shuffle=False, num_epochs=1)
+ features = input_fn()
+
+ coord = coordinator.Coordinator()
+ threads = queue_runner_impl.start_queue_runners(session, coord=coord)
+
+ res = session.run(features)
+ self.assertAllEqual(res['a'], np.asarray([0, 1, 0, 1]).reshape(-1, 1))
+ self.assertAllEqual(res['b'], np.asarray([32, 33, 32, 33]).reshape(-1, 1))
+ self.assertAllEqual(res['label'],
+ np.asarray([-32, -31, -32, -31]).reshape(-1, 1))
+
+ with self.assertRaises(errors.OutOfRangeError):
+ session.run([features])
+
+ coord.request_stop()
+ coord.join(threads)
+
+ def testGeneratorInputFnWithMismatchinGeneratorKeys(self):
+
+ def generator():
+ index = 0
+ yield {
+ 'a': np.ones(1) * index,
+ 'b': np.ones(1) * index + 32,
+ 'label': np.ones(1) * index - 32
+ }
+ index = 1
+ yield {
+ 'a': np.ones(1) * index,
+ 'c': np.ones(1) * index + 32,
+ 'label': np.ones(1) * index - 32
+ }
+
+ with self.test_session() as session:
+ input_fn = generator_io.generator_input_fn(
+ generator, target_key=None, batch_size=2, shuffle=False, num_epochs=1)
+ features = input_fn()
+
+ coord = coordinator.Coordinator()
+ threads = queue_runner_impl.start_queue_runners(session, coord=coord)
+
+ with self.assertRaises(errors.OutOfRangeError):
+ session.run([features])
+
+ with self.assertRaisesRegex(KeyError, 'key mismatch between dicts emitted'
+ ' by GenFunExpected'):
+ coord.request_stop()
+ coord.join(threads)
+
+
+if __name__ == '__main__':
+ test.main()
diff --git a/tensorflow/contrib/makefile/Makefile b/tensorflow/contrib/makefile/Makefile
index 4db818a3a9..2b2e885689 100644
--- a/tensorflow/contrib/makefile/Makefile
+++ b/tensorflow/contrib/makefile/Makefile
@@ -370,6 +370,7 @@ ifeq ($(TARGET),IOS)
ifeq ($(IOS_ARCH),I386)
CXXFLAGS += -mios-simulator-version-min=$(MIN_SDK_VERSION) \
-arch i386 \
+ -mno-sse \
-fembed-bitcode \
-D__thread= \
-DUSE_GEMM_FOR_CONV \
diff --git a/tensorflow/contrib/makefile/README.md b/tensorflow/contrib/makefile/README.md
index ac10dfc722..f061b58775 100644
--- a/tensorflow/contrib/makefile/README.md
+++ b/tensorflow/contrib/makefile/README.md
@@ -75,7 +75,7 @@ To run the executable, use:
```bash
tensorflow/contrib/makefile/gen/bin/benchmark \
- --graph=~/graphs/inception/tensorflow_inception_graph.pb
+ --graph=$HOME/graphs/inception/tensorflow_inception_graph.pb
```
## Android
diff --git a/tensorflow/contrib/opt/python/training/external_optimizer.py b/tensorflow/contrib/opt/python/training/external_optimizer.py
index ff80167ff4..0909760b38 100644
--- a/tensorflow/contrib/opt/python/training/external_optimizer.py
+++ b/tensorflow/contrib/opt/python/training/external_optimizer.py
@@ -99,8 +99,13 @@ class ExternalOptimizerInterface(object):
slice(start, end) for start, end in zip(accumulated_dims[:-1],
accumulated_dims[1:])]
- def minimize(self, session=None, feed_dict=None, fetches=None,
- step_callback=None, loss_callback=None):
+ def minimize(self,
+ session=None,
+ feed_dict=None,
+ fetches=None,
+ step_callback=None,
+ loss_callback=None,
+ **run_kwargs):
"""Minimize a scalar `Tensor`.
Variables subject to optimization are updated in-place at the end of
@@ -120,6 +125,7 @@ class ExternalOptimizerInterface(object):
flattened into a single vector.
loss_callback: A function to be called every time the loss and gradients
are computed, with evaluated fetches supplied as positional arguments.
+ **run_kwargs: kwargs to pass to `session.run`.
"""
session = session or ops.get_default_session()
feed_dict = feed_dict or {}
@@ -160,8 +166,10 @@ class ExternalOptimizerInterface(object):
for packing_slice in self._packing_slices]
# Set optimization variables to their new values.
- session.run(self._var_updates,
- feed_dict=dict(zip(self._update_placeholders, var_vals)))
+ session.run(
+ self._var_updates,
+ feed_dict=dict(zip(self._update_placeholders, var_vals)),
+ **run_kwargs)
def _minimize(self, initial_val, loss_grad_func, equality_funcs,
equality_grad_funcs, inequality_funcs, inequality_grad_funcs,
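
A sketch of what the new `**run_kwargs` pass-through enables, e.g. tracing the variable-update step. It assumes `ScipyOptimizerInterface`, the contrib subclass of this interface; the toy loss is illustrative:

```python
import tensorflow as tf
from tensorflow.contrib.opt import ScipyOptimizerInterface

x = tf.Variable(2.0)
loss = tf.square(x - 5.0)
optimizer = ScipyOptimizerInterface(loss)  # defaults to L-BFGS-B

with tf.Session() as session:
  session.run(tf.global_variables_initializer())
  run_metadata = tf.RunMetadata()
  optimizer.minimize(
      session,
      options=tf.RunOptions(trace_level=tf.RunOptions.FULL_TRACE),
      run_metadata=run_metadata)  # forwarded to session.run via **run_kwargs
```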
diff --git a/tensorflow/contrib/rnn/ops/lstm_ops.cc b/tensorflow/contrib/rnn/ops/lstm_ops.cc
index 2de40825c9..699cc6c88a 100644
--- a/tensorflow/contrib/rnn/ops/lstm_ops.cc
+++ b/tensorflow/contrib/rnn/ops/lstm_ops.cc
@@ -78,7 +78,7 @@ ci = tanh(ci)
cs = ci .* i + cs_prev .* f
cs = clip(cs, cell_clip)
-o = sigmoid(cs * wco + f)
+o = sigmoid(cs * wco + o)
co = tanh(cs)
h = co .* o
```
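
The one-character fix above is easy to misread in isolation: the `o` on the right-hand side is the output gate's pre-activation from the joint matmul defined earlier in this doc comment, so the peephole term must come from the updated cell state rather than from the forget gate. In the usual peephole notation (elementwise product $\odot$, output-gate peephole weight $w_{co}$, pre-activation $\hat{o}$):

$$o = \sigma(cs \odot w_{co} + \hat{o}), \qquad h = \tanh(cs) \odot o$$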
diff --git a/tensorflow/contrib/seq2seq/python/ops/attention_wrapper.py b/tensorflow/contrib/seq2seq/python/ops/attention_wrapper.py
index b55e1ff848..d01d375119 100644
--- a/tensorflow/contrib/seq2seq/python/ops/attention_wrapper.py
+++ b/tensorflow/contrib/seq2seq/python/ops/attention_wrapper.py
@@ -322,9 +322,10 @@ class BahdanauAttention(_BaseAttentionMechanism):
Args:
query: Tensor of dtype matching `self.values` and shape
`[batch_size, query_depth]`.
+
Returns:
score: Tensor of dtype matching `self.values` and shape
- `[batch_size, self.num_units]`.
+ `[batch_size, max_time]` (`max_time` is memory's `max_time`).
"""
with variable_scope.variable_scope(None, "bahdanau_attention", [query]):
processed_query = self.query_layer(query) if self.query_layer else query
@@ -522,7 +523,8 @@ class AttentionWrapper(core_rnn_cell.RNNCell):
- Step 5: Calculate the context vector as the inner product between the
alignments and the attention_mechanism's values (memory).
- Step 6: Calculate the attention output by concatenating the cell output
- and context through the attention layer.
+ and context through the attention layer (a linear layer with
+ `attention_size` outputs).
Args:
inputs: (Possibly nested tuple of) Tensor, the input at this time step.
@@ -531,10 +533,10 @@ class AttentionWrapper(core_rnn_cell.RNNCell):
scope: Must be `None`.
Returns:
- A tuple `(attention, next_state)`, where:
+ A tuple `(attention_or_cell_output, next_state)`, where:
- - `attention` is the attention passed to the layer above.
- - `next_state` is an instance of `AttentionWrapperState`
+      - `attention_or_cell_output`: the attention output or the raw cell
+        output, depending on `output_attention`.
+ - `next_state` is an instance of `DynamicAttentionWrapperState`
containing the state calculated at this time step.
Raises:
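A hedged usage sketch (assuming the constructor accepts `attention_size`, as
the Step 6 docstring above references; all values are illustrative):

```
import tensorflow as tf

memory = tf.random_normal([4, 10, 16])  # [batch_size, max_time, depth]
mechanism = tf.contrib.seq2seq.BahdanauAttention(num_units=8, memory=memory)
cell = tf.contrib.rnn.BasicLSTMCell(8)
attn_cell = tf.contrib.seq2seq.AttentionWrapper(cell, mechanism,
                                                attention_size=8)

inputs = tf.random_normal([4, 6, 8])
# Each output step is the attention (or the raw cell output, if
# output_attention=False was passed to the constructor).
outputs, final_state = tf.nn.dynamic_rnn(attn_cell, inputs, dtype=tf.float32)
```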
diff --git a/tensorflow/contrib/seq2seq/python/ops/loss.py b/tensorflow/contrib/seq2seq/python/ops/loss.py
index cfe6ac5134..39a6d2f58b 100644
--- a/tensorflow/contrib/seq2seq/python/ops/loss.py
+++ b/tensorflow/contrib/seq2seq/python/ops/loss.py
@@ -12,7 +12,6 @@
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
-
"""Seq2seq loss operations for use in sequence models.
"""
@@ -28,22 +27,33 @@ from tensorflow.python.ops import nn_ops
__all__ = ["sequence_loss"]
-def sequence_loss(logits, targets, weights,
- average_across_timesteps=True, average_across_batch=True,
- softmax_loss_function=None, name=None):
- """Weighted cross-entropy loss for a sequence of logits (per example).
+def sequence_loss(logits,
+ targets,
+ weights,
+ average_across_timesteps=True,
+ average_across_batch=True,
+ softmax_loss_function=None,
+ name=None):
+ """Weighted cross-entropy loss for a sequence of logits.
+
+ Depending on the values of `average_across_timesteps` and
+ `average_across_batch`, the returned Tensor will have rank 0, 1, or 2, as
+ these arguments reduce the per-target cross-entropy, which has shape
+ `[batch_size, sequence_length]`, over their respective dimensions. For
+ example, if `average_across_timesteps` is `True` and `average_across_batch`
+ is `False`, then the returned Tensor will have shape `[batch_size]`.
Args:
- logits: A 3D Tensor of shape
- [batch_size x sequence_length x num_decoder_symbols] and dtype float.
+ logits: A Tensor of shape
+ `[batch_size, sequence_length, num_decoder_symbols]` and dtype float.
The logits correspond to the prediction across all classes at each
timestep.
- targets: A 2D Tensor of shape [batch_size x sequence_length] and dtype
+ targets: A Tensor of shape `[batch_size, sequence_length]` and dtype
int. The target represents the true class at each timestep.
- weights: A 2D Tensor of shape [batch_size x sequence_length] and dtype
- float. Weights constitutes the weighting of each prediction in the
- sequence. When using weights as masking set all valid timesteps to 1 and
- all padded timesteps to 0.
+ weights: A Tensor of shape `[batch_size, sequence_length]` and dtype
+ float. `weights` constitutes the weighting of each prediction in the
+ sequence. When using `weights` as masking, set all valid timesteps to 1
+ and all padded timesteps to 0, e.g. a mask returned by `tf.sequence_mask`.
average_across_timesteps: If set, sum the cost across the sequence
dimension and divide the cost by the total label weight across timesteps.
average_across_batch: If set, sum the cost across the batch dimension and
@@ -55,7 +65,10 @@ def sequence_loss(logits, targets, weights,
name: Optional name for this operation, defaults to "sequence_loss".
Returns:
- A scalar float Tensor: The average log-perplexity per symbol (weighted).
+ A float Tensor of rank 0, 1, or 2 depending on the
+ `average_across_timesteps` and `average_across_batch` arguments. By default,
+ it has rank 0 (scalar) and is the weighted average cross-entropy
+ (log-perplexity) per symbol.
Raises:
ValueError: logits does not have 3 dimensions or targets does not have 2
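A short sketch of the documented reduction behavior (illustrative values; the
shapes follow the docstring above):

```
import tensorflow as tf

batch_size, seq_len, vocab = 4, 5, 10
logits = tf.random_normal([batch_size, seq_len, vocab])
targets = tf.zeros([batch_size, seq_len], dtype=tf.int32)
weights = tf.sequence_mask([5, 4, 3, 2], maxlen=seq_len, dtype=tf.float32)

# Defaults: average across both dimensions -> rank-0 (scalar) loss.
scalar_loss = tf.contrib.seq2seq.sequence_loss(logits, targets, weights)
# Average across timesteps only -> one loss per batch element, [batch_size].
per_example = tf.contrib.seq2seq.sequence_loss(
    logits, targets, weights, average_across_batch=False)
```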
diff --git a/tensorflow/core/BUILD b/tensorflow/core/BUILD
index 4aa39e5202..ba761cd7c6 100644
--- a/tensorflow/core/BUILD
+++ b/tensorflow/core/BUILD
@@ -721,7 +721,8 @@ cc_library(
"//tensorflow/core/kernels:quantized_ops",
]) + if_mkl([
"//tensorflow/core/kernels:mkl_conv_op",
- "//tensorflow/core/kernels:mkl_matmul_op",
+ "//tensorflow/core/kernels:mkl_pooling_ops",
+ "//tensorflow/core/kernels:mkl_relu_op",
"//tensorflow/core/kernels:mkl_tfconv_op",
]),
)
@@ -2094,7 +2095,8 @@ tf_cc_test_mkl(
"//tensorflow/cc:scope",
"//tensorflow/cc:sendrecv_ops",
"//tensorflow/core/kernels:mkl_conv_op",
- "//tensorflow/core/kernels:mkl_matmul_op",
+ "//tensorflow/core/kernels:mkl_pooling_ops",
+ "//tensorflow/core/kernels:mkl_relu_op",
"//tensorflow/core/kernels:mkl_tfconv_op",
"//tensorflow/core/kernels:ops_util",
"//third_party/eigen3",
diff --git a/tensorflow/core/graph/mkl_layout_pass.cc b/tensorflow/core/graph/mkl_layout_pass.cc
index 9a2e4bcfa0..309c4cd774 100644
--- a/tensorflow/core/graph/mkl_layout_pass.cc
+++ b/tensorflow/core/graph/mkl_layout_pass.cc
@@ -15,13 +15,15 @@ limitations under the License.
#ifdef INTEL_MKL
+#include <algorithm>
#include <functional>
#include <memory>
+#include <queue>
+#include <set>
#include <string>
#include <unordered_set>
#include <utility>
#include <vector>
-
#include "tensorflow/core/common_runtime/function.h"
#include "tensorflow/core/common_runtime/optimization_registry.h"
#include "tensorflow/core/framework/node_def_util.h"
@@ -39,68 +41,91 @@ limitations under the License.
namespace tensorflow {
-// This pass implements rewriting of graph for propagating Mkl
-// layout as an additional output tensor (we will loosely call a
-// tensor that carries Mkl layout as Mkl tensor henceforth.)
-// from every Mkl supported NN layer.
+// This pass implements rewriting of graph to support following scenarios:
+// (A) Merging nodes in the graph
+// (B) Rewriting a node in the graph to a new node
+// Rewrite happens under following 2 scenarios:
+// 1) Propagating Mkl layout as an additional output tensor
+// (we will loosely call a tensor that carries Mkl layout as Mkl tensor
+// henceforth.) from every Mkl supported NN layer.
+// 2) Context-based rewrite: This is needed in order to optimize
+// gradient ops of Conv2D+BiasAdd. The gradient op of both Conv2D and
+// MatMul is BiasAddGrad, and we need to rewrite BiasAddGrad into
+// a Conv2D-specific BiasAddGrad or a MatMul-specific BiasAddGrad.
+// This is a context-specific optimization, where the context is the
+// forward operator that the BiasAddGrad corresponds to.
+//
+// Example of A : Merging nodes in the graph
+// -----------------------------------------
+// Currently, we merge Conv2D+BiasAdd together. Consider Conv2D and BiasAdd as:
+//
+// O = Conv2D(A, B)
+// P = BiasAdd(O, C)
+//
+// We merge them into Conv2DWithBias as:
+// P = MklConv2DWithBias(A, A_m, B, B_m, C, C_m)
//
-// As a example, consider Relu layer. Current definition of Relu
-// layer looks like:
+// Meaning of A_m, B_m and C_m is explained in B.1.
+//
+// Merge rules:
+// - Merge for Conv2D and BiasAdd happens only when output of Conv2D _only_
+// goes to BiasAdd.
+// - Also, the attributes that the two nodes have in common must have the
+// same values.
+// - Both nodes must have been assigned to the same device (if any).
+//
+// Example of B.1 : Rewriting nodes to Mkl nodes
+// ---------------------------------------------
+// Consider Relu layer. Current definition of Relu layer looks like:
//
// O = Relu(A)
//
// Relu has 1 input (A), and 1 output (O).
//
-// This rewrite pass will generate a new graph node for Relu
-// (new node is called MklRelu) as:
+// This rewrite pass will generate a new graph node for Relu (new node is
+// called MklRelu) as:
//
// O, O_m = MklRelu(A, A_m)
//
-// MklRelu has 2 inputs (A and A_m) and 2 outputs (O and O_m).
-// Here A input is same as A input of Relu; O output is same
-// as O output of Relu. O_m is the additional output tensor
-// that will be set by MklRelu, and it represents Mkl tensor
-// corresponding to O -- in other words, O_m is some kind of
-// metadata for O. A_m is additional input of Relu, and it
-// represents metadata for A - as O_m is metadata for O, A_m
-// is metadata for A. MklRelu receives this metadata from
-// previous layer (in the graph).
+// MklRelu has 2 inputs (A and A_m) and 2 outputs (O and O_m). Here input A is
+// the same as input A of Relu; output O is the same as output O of Relu. O_m
+// is the additional output tensor that will be set by MklRelu, and it
+// represents the Mkl tensor corresponding to O -- in other words, O_m is some
+// kind of metadata for O. A_m is an additional input of MklRelu, and it
+// represents metadata for A -- as O_m is metadata for O, A_m is metadata for
+// A. MklRelu receives this metadata from the previous layer (in the graph).
//
-// When previous layer in the graph is Mkl layer, A_m will
-// represent a valid Mkl tensor. But when previous Mkl layer
-// is not an Mkl layer, then A_m represents a dummy Mkl tensor.
+// When the previous layer in the graph is an Mkl layer, A_m will represent a
+// valid Mkl tensor. But when the previous layer is not an Mkl layer, then A_m
+// represents a dummy Mkl tensor.
//
// Rewriting rules:
-// - Selection of an op for rewriting happens by registering
-// an op with this pass. If an op is not registered, then
-// it is not rewritten.
+// - Selection of an op for rewriting happens by registering an op with this
+// pass. If an op is not registered, then it is not rewritten.
// - Number of inputs after rewriting:
-// Since for every input Tensorflow tensor, the rewritten
-// layer gets Mkl tensor, rewritten op gets 2*N inputs,
-// where N is the number of inputs for original op.
+// Since for every input Tensorflow tensor, the rewritten layer gets Mkl
+// tensor, rewritten op gets 2*N inputs, where N is the number of inputs
+// for original op.
// - Number of outputs after rewriting:
-// Since for every output Tensorflow tensor, the rewritten
-// layer generates Mkl tensor, rewritten op generates 2*N
-// outputs, where N is the number of outputs of original op.
+// Since for every output Tensorflow tensor, the rewritten layer generates
+// Mkl tensor, rewritten op generates 2*N outputs, where N is the number
+// of outputs of original op.
// - Ordering of Tensorflow tensors and Mkl tensors:
-// Since every op generates twice the number of inputs and
-// outputs, one could imagine different ordering among
-// Tensorflow tensors and Mkl tensors. E.g., let's assume
-// an op 'Conv2D' takes (A, B) as input, then new op
-// 'MklConv2D' can take (A, A_m, B, B_m) as input or it
-// can also take (A, B, A_m, B_m) as input. Among N inputs
-// one can get N! permutations.
-//
-// So the question is: which one do we follow? Currently,
-// we follow an intuitive order where Mkl tensor follows a
-// corresponding Tensorflow tensor immediately. In the
-// context of above example, it will be: (A, A_m, B, B_m).
-// We follow same ordering rule for output tensors.
-//
-// NOTE: Current rewriting approach rewrites an op to Mkl op without
-// any conditions. But in the future, it may be possible to
-// consider conditions such as input shapes and sizes to rewrite
-// an op.
+// Since every op generates twice the number of inputs and outputs, one
+// could imagine different ordering among Tensorflow tensors and Mkl
+// tensors. E.g., let's assume an op 'Conv2D' takes (A, B) as input, then
+// new op 'MklConv2D' can take (A, A_m, B, B_m) as input or it can also
+// take (A, B, A_m, B_m) as input. Among N inputs one can get N!
+// permutations.
+//
+// So the question is: which one do we follow? Currently, we follow an
+// intuitive order where an Mkl tensor immediately follows its corresponding
+// Tensorflow tensor. In the context of the above example, it will be: (A,
+// A_m, B, B_m). We follow the same ordering rule for output tensors.
+//
+// NOTE: Current rewriting approach rewrites an op to Mkl op without any
+// conditions. But in the future, it may be possible to consider
+// conditions such as input shapes and sizes to rewrite an op.
//
// Graph rewrite algorithm:
// Algorithm: Graph Rewrite
@@ -147,13 +172,137 @@ namespace tensorflow {
// it is, then we rewrite that node after constructing new inputs to
// the node. If it is not Mkl layer, then we do not rewrite the node.
//
+// Handling workspace propagation for certain ops:
+//
+// Certain backward ops in MKL (MaxPool, LRN and BatchNorm) require
+// passing of workspace from their corresponding forward ops. But
+// TensorFlow does not have a notion of workspace and as a result
+// does not allow producing additional outputs from these forward ops.
+// For these ops, we need to add two additional edges between a forward
+// op and its corresponding backward op: one edge carries the workspace
+// tensor value, and the other carries the Mkl tensor for the workspace
+// tensor.
+//
+// Example:
+//
+// Typical graph for MaxPool and its gradient looks like:
+//
+// A = MaxPool(T)
+// B = MaxPoolGrad(X, A, Y)
+//
+// We will transform this graph to propagate workspace as:
+//
+// A, A_m, W, W_m = MklMaxPool(T, T_m)
+// B, B_m = MklMaxPoolGrad(X, X_m, A, A_m, Y, Y_m, W, W_m)
+//
+// Here W is the workspace tensor. Transformed tensors with name
+// suffix _m are Mkl tensors and this transformation has been done
+// using the algorithm discussed earlier. The transformation for
+// workspace only adds extra outputs (W, W_m) for forward op and
+// connects them to corresponding backward ops.
+//
+// Terms:
+//
+// Forward op name = name of the op in the forward pass
+// where workspace originates (MaxPool in this example)
+// Backward op name = name of the op in the backward pass that receives
+// workspace from forward op (MaxPoolGrad in the example)
+// Slot = Number of the output or input slot that will be
+// used by the workspace (2 for MklMaxPool as W is 3rd
+// output of MaxPool (0 is 1st); 6 for MklMaxPoolGrad)
+//
+// Question:
+//
+// How do we associate a backward op with its forward op? There can be more
+// than one op with the exact same name.
+//
+// In this example we associate MaxPoolGrad with MaxPool. But there
+// could be more than one MaxPool op. To solve this problem, we look
+// for _direct_ edge between forward op and backward op (tensor A is
+// flowing along this edge in the example.)
+//
+// How do we transform forward and backward op when there is no direct
+// edge between them? In such a case, we generate dummy tensors as
+// workspace tensors. For the example, the transformation of MaxPool will
+// be exactly the same; it is just that MaxPool won't generate any
+// workspace tensor. For MaxPoolGrad, the transformation will also be the
+// same, but instead of connecting W and W_m with the outputs of MaxPool,
+// we will produce dummy tensors for them, and we will set the
+// workspace_enabled attribute to false.
+//
+// Example of B.2 : Context-based node rewrite
+// -------------------------------------------
+// Consider BiasAddGrad op as:
+//
+// O = MklConv2DWithBias(A, A_m, B, B_m, C, C_m)
+// P = BiasAddGrad(O)
+//
+// Then we rewrite it as:
+//
+// P = MklConv2DWithBiasBackpropBias(O, O_m)
+//
+// The 'distance', in hops, between the input of BiasAddGrad and
+// MklConv2DWithBias is the context matching depth. If MklConv2DWithBias is
+// not within the context matching depth, then we do not rewrite BiasAddGrad.
+
+// How many hops do we search for a matching node in the backward dataflow
+// graph? We use a maxhop of 10 based on empirical observations. Also, these
+// are maxhops in the backward dataflow graph. Since the input of forward
+// nodes (Conv2D) goes directly to backward nodes, we do not expect the
+// hop-distance to be more than a few nodes.
+static const size_t kNodeMergeContextMaxDepth = 10;
+
class MklLayoutRewritePass : public GraphOptimizationPass {
public:
MklLayoutRewritePass() {
csinfo_.conv2d = "Conv2D";
-
- ninfo_.push_back(
- {csinfo_.conv2d, GetMklOpName(csinfo_.conv2d), 2, CopyAttrsConv2D});
+ csinfo_.mklconv2d = "MklConv2D";
+ csinfo_.mklconv2dwithbias = "MklConv2DWithBias";
+ csinfo_.mklconv2dwithbiasbackpropbias = "MklConv2DWithBiasBackpropBias";
+ csinfo_.biasadd = "BiasAdd";
+ csinfo_.matmul = "MatMul";
+ csinfo_.biasaddgrad = "BiasAddGrad";
+ csinfo_.relu = "Relu";
+ csinfo_.relugrad = "ReluGrad";
+ csinfo_.maxpool = "MaxPool";
+ csinfo_.maxpoolgrad = "MaxPoolGrad";
+ csinfo_.avgpool = "AvgPool";
+ csinfo_.avgpoolgrad = "AvgPoolGrad";
+ csinfo_.conv2dgradinput = "Conv2DBackpropInput";
+ csinfo_.conv2dgradfilter = "Conv2DBackpropFilter";
+
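+ // RewriteInfo fields: {name, newname, numins, copyattrs, rewriterule};
+ // see the RewriteInfo struct below for the meaning of each field.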
+ rinfo_.push_back(
+ {csinfo_.conv2d, csinfo_.mklconv2d, 2, CopyAttrsConv2D, AlwaysRewrite});
+ rinfo_.push_back({csinfo_.conv2dgradfilter,
+ GetMklOpName(csinfo_.conv2dgradfilter), 3,
+ CopyAttrsConv2D, AlwaysRewrite});
+ rinfo_.push_back({csinfo_.conv2dgradinput,
+ GetMklOpName(csinfo_.conv2dgradinput), 3, CopyAttrsConv2D,
+ AlwaysRewrite});
+ rinfo_.push_back({csinfo_.relu, GetMklOpName(csinfo_.relu), 1,
+ CopyAttrsRelu, AlwaysRewrite});
+ rinfo_.push_back({csinfo_.maxpool, GetMklOpName(csinfo_.maxpool), 1,
+ CopyAttrsPooling, AlwaysRewrite});
+ rinfo_.push_back({csinfo_.maxpoolgrad, GetMklOpName(csinfo_.maxpoolgrad), 3,
+ CopyAttrsPooling, AlwaysRewrite});
+ rinfo_.push_back({csinfo_.avgpool, GetMklOpName(csinfo_.avgpool), 1,
+ CopyAttrsPooling, AlwaysRewrite});
+ rinfo_.push_back({csinfo_.avgpoolgrad, GetMklOpName(csinfo_.avgpoolgrad), 2,
+ CopyAttrsPooling, AlwaysRewrite});
+
+ // Add info about which ops to add workspace edge to and the slots.
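+ // Fields: {fwdop, bwdop, fwdslot, bwdslot, wsfwdslot, wsbwdslot}: MaxPool's
+ // output 0 feeds MaxPoolGrad's input 1; the workspace is output slot 2 of
+ // MklMaxPool and input slot 6 of MklMaxPoolGrad.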
+ wsinfo_.push_back({csinfo_.maxpool, csinfo_.maxpoolgrad, 0, 1, 2, 6});
+
+ // Add a rule for merging nodes
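+ // Fields: {pred, succ, op, newnode}: a BiasAdd (succ) whose operand 0 is
+ // produced by an MklConv2D (pred) is merged into MklConv2DWithBias.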
+ minfo_.push_back(
+ {csinfo_.mklconv2d, csinfo_.biasadd, 0, csinfo_.mklconv2dwithbias});
+
+ // We use a maxhop of 10 based on empirical observations. Also, these are
+ // maxhops in the backward dataflow graph. Since the input of forward
+ // nodes (Conv2D) goes directly to backward nodes, we do not expect the
+ // hop-distance to be more than a few nodes.
+ cinfo_.push_back({csinfo_.biasaddgrad, csinfo_.mklconv2dwithbias,
+ kNodeMergeContextMaxDepth});
}
// Standard interface to run pass
@@ -176,20 +325,79 @@ class MklLayoutRewritePass : public GraphOptimizationPass {
string name; // Original name of the op in the graph
string newname; // New name of op in the graph
int numins; // Number of inputs to the original op
- std::function<void(Node*, NodeBuilder*)>
- copyattrs; // Function handler
- // to copy attributes from old node to new node.
- } NodesInfo;
+ // Function handler to copy attributes from old node to new node.
+ std::function<void(const Node*, NodeBuilder*)> copyattrs;
+ std::function<bool(const Node*)> rewriterule; // Rule under which to
+ // rewrite this node.
+ } RewriteInfo;
+
+ /// Structure to specify forward op, backward op, and the slot numbers
+ /// in forward and backward op where we will add workspace edge.
+ typedef struct {
+ string fwdop; // Name of the forward op in the graph
+ string bwdop; // Name of the backward op in the graph
+ int fwdslot; // Output slot in the forward op node where actual
+ // output tensor resides
+ int bwdslot; // Input slot in the backward op node where actual
+ // input tensor resides
+ int wsfwdslot; // Output slot in the forward op node where workspace
+ // edge is added
+ int wsbwdslot; // Input slot in the backward op node where workspace
+ // edge is added
+ } WorkSpaceInfo;
+
+ /// Structure to specify information used in node merge
+ typedef struct {
+ string pred; // Predecessor node string
+ string succ; // Successor node string
+ int op; // Which operand number of the successor node the
+ // predecessor node corresponds to.
+ string newnode; // Name of the node after merge
+ } MergeInfo;
+
+ /// Structure to specify the context information used in node rewrite rule
+ typedef struct {
+ string node; // Name of the node to be rewritten
+ string fwd; // Node name in forward pass that this node
+ // corresponds to
+ size_t maxhop; // Maximum number of hops the fwd is located
+ // from this node. If fwd is farther than maxhop
+ // then we do not rewrite the node.
+ } ContextInfo;
/// Structure to store all constant strings
struct {
string relu;
string relugrad;
+ // Conv ops
string conv2d;
+ string mklconv2d;
+ string conv2dgradinput;
+ string conv2dgradfilter;
+ string mklconv2dwithbias;
+ string mklconv2dwithbiasbackpropbias;
+ // Pooling ops
+ string maxpool;
+ string maxpoolgrad;
+ string avgpool;
+ string avgpoolgrad;
+ // Others
+ string biasadd;
+ string matmul;
+ string biasaddgrad;
} csinfo_;
/// Maintain info about nodes to rewrite
- std::vector<NodesInfo> ninfo_;
+ std::vector<RewriteInfo> rinfo_;
+
+ /// Maintain info about nodes to add workspace edge
+ std::vector<WorkSpaceInfo> wsinfo_;
+
+ /// Maintain info to be merged
+ std::vector<MergeInfo> minfo_;
+
+ /// Maintain context info for context-based node rewrites
+ static std::vector<ContextInfo> cinfo_;
/// Hash table to maintain nodes visited in the graph.
std::unordered_set<const Node*> visited_nodes_;
@@ -209,6 +417,9 @@ class MklLayoutRewritePass : public GraphOptimizationPass {
// Mark the node as rewritten
inline void MarkRewrittenNode(Node* n) { visited_nodes_.insert(n); }
+ // Clear all visited nodes
+ inline void UnMarkRewrittenNodes() { visited_nodes_.clear(); }
+
// Get the name of Mkl op from original TensorFlow op
// We prefix 'Mkl' to the original op to get Mkl op.
// TODO(nhasabni) We should move this to mkl_util.h.
@@ -218,6 +429,71 @@ class MklLayoutRewritePass : public GraphOptimizationPass {
return string(kMklOpPrefix) + name;
}
+ // Return a node that can be merged with input node 'n'
+ //
+ // @return pointer to the node if we can find such a
+ // node. Otherwise, it returns nullptr.
+ Node* CheckForNodeMerge(const Node* n) const;
+
+ // Merge predecessor node with its successor.
+ // Currently, we merge Conv2D with BiasAdd only.
+ //
+ // Input nodes succ and pred may be deleted if the call to
+ // this function is successful. Attempting to use the pointers
+ // after the call may result in undefined behavior.
+ //
+ // @input g - input graph, succ - successor node, pred - predecessor node
+ // @return Status::OK(), if merging is successful and supported.
+ // Returns appropriate Status error code otherwise.
+ // Graph is updated in case nodes are merged. Otherwise, it is
+ // not updated.
+ Status MergeNode(std::unique_ptr<Graph>* g, Node* succ, Node* pred);
+
+ // Check if the node 'n' has any applicable rewrite rule
+ // We check for 2 scenarios for rewrite.
+ //
+ // @return RewriteInfo* for the applicable rewrite rule
+ const RewriteInfo* CheckForNodeRewrite(const Node* n) const;
+
+ // Default rewrite rule to be used in scenario 1 for rewrite.
+ // @return - true (since we want to always rewrite)
+ static bool AlwaysRewrite(const Node* n) { return true; }
+ // Rewrite rule that uses context-information for matching
+ // used in scenario 2.
+ //
+ // @input - Node 'n' for which to search for matching context
+ // @return - true if matching context is found; false otherwise.
+ static bool ContextMatchRewrite(const Node* n);
+
+ // Helper function that searches the matching contextinfo for the node.
+ // Implements depth-first search in the data dependence graph for the
+ // gradient op in the backward direction.
+ //
+ // @input n - Node (gradient op) whose contextinfo is to be searched,
+ // fwdn - pointer to node from the forward pass that this node
+ // belongs to. fwdn cannot be NULL.
+ // @return Matching contextinfo in case a match is found; null otherwise.
+ // Also updates *fwdn with pointer to forward node that this context
+ // matches.
+ static const ContextInfo* SearchMatchingContext(const Node* n,
+ const Node** fwdn);
+
+ // Rewrites input node to a new node specified by its matching rewrite info.
+ //
+ // Method first searches matching rewrite info for input node and then
+ // uses that info to rewrite.
+ //
+ // Input node may be deleted in case of rewrite. Attempt to use the node
+ // after the call can result in undefined behaviors.
+ //
+ // @input g - input graph, n - Node to be rewritten,
+ // ri - matching rewriteinfo
+ // @return Status::OK(), if the input node is rewritten;
+ // Returns appropriate Status error code otherwise.
+ // Graph is updated in case the input node is rewritten.
+ // Otherwise, it is not updated.
+ Status RewriteNode(std::unique_ptr<Graph>* g, Node* n, const RewriteInfo* ri);
+
// Setup new inputs using old inputs 'inputs' for the rewritten node in 'nb'
// in graph 'g'. Original node is input in 'orign'.
//
@@ -230,28 +506,40 @@ class MklLayoutRewritePass : public GraphOptimizationPass {
const gtl::InlinedVector<std::pair<Node*, int>, 4>& inputs,
NodeBuilder* nb, Node* orign);
- // Rewrite Node 'n' in graph 'g' with rewrite information specified in 'ni'
- // Returns Status::OK() if node rewrite is successful, otherwise returns
- // appropriate error status
- Status RewriteNode(std::unique_ptr<Graph>* g, Node* n, const NodesInfo& ni);
+ // Add workspace edge on the input or output side of Node 'orign' by using
+ // NodeBuilder 'nb' for the new node provided. If 'orign' does not dictate
+ // adding workspace edge then do not add it.
+ void AddWorkSpaceEdgeIfNeeded(std::unique_ptr<Graph>* g, Node* orign,
+ NodeBuilder* nb);
// Functions specific to operators to copy attributes
// We need operator-specific function to copy attributes because the framework
// does not provide any generic function for it.
- static void CopyAttrsConv2D(Node* orign, NodeBuilder* nb);
+ static void CopyAttrsConv2D(const Node* orign, NodeBuilder* nb);
+ static void CopyAttrsBiasAddGrad(const Node* orign, NodeBuilder* nb);
+ static void CopyAttrsPooling(const Node* orign, NodeBuilder* nb);
+ static void CopyAttrsRelu(const Node* orign, NodeBuilder* nb);
// Generate a graph node in graph 'g' representing a dummy Mkl tensor node,
// using node for original node 'orign' and return it in '*out'.
// TODO(nhasabni) We should move this to mkl_util.h
void GetDummyMklTensorNode(std::unique_ptr<Graph>* g, Node** out,
Node* orign);
+ void GetDummyWorkspaceTensorNode(std::unique_ptr<Graph>* g, Node** out,
+ Node* orign);
};
+std::vector<MklLayoutRewritePass::ContextInfo> MklLayoutRewritePass::cinfo_;
+
// We register Mkl rewrite pass for phase 1 in pre-placement group.
// Do not change the ordering of the Mkl passes.
REGISTER_OPTIMIZATION(OptimizationPassRegistry::PRE_PLACEMENT, 1,
MklLayoutRewritePass);
+//////////////////////////////////////////////////////////////////////////
+// Helper functions for creating new node
+//////////////////////////////////////////////////////////////////////////
+
static void FillInputs(const Node* n,
gtl::InlinedVector<Node*, 4>* control_edges,
gtl::InlinedVector<std::pair<Node*, int>, 4>* in) {
@@ -273,47 +561,6 @@ static void FillInputs(const Node* n,
}
}
-//////////////////////////////////////////////////////////////////////////
-
-// Macros to build new node with different number of inputs.
-// We need this way because we need to specify all the inputs when
-// building a node. Comment at core/graph/node_builder.h, line 85-86.
-
-#define SETUP_INPUTS1(nb, op1) \
- do { \
- nb->Input(op1.node, op1.index); \
- } while (0)
-
-#define SETUP_INPUTS2(nb, op1, op2) \
- do { \
- nb->Input(op1.node, op1.index); \
- nb->Input(op2.node, op2.index); \
- } while (0)
-
-#define SETUP_INPUTS3(nb, op1, op2, op3) \
- do { \
- nb->Input(op1.node, op1.index); \
- nb->Input(op2.node, op2.index); \
- nb->Input(op3.node, op3.index); \
- } while (0)
-
-#define SETUP_INPUTS4(nb, op1, op2, op3, op4) \
- do { \
- nb->Input(op1.node, op1.index); \
- nb->Input(op2.node, op2.index); \
- nb->Input(op3.node, op3.index); \
- nb->Input(op4.node, op4.index); \
- } while (0)
-
-#define SETUP_INPUTS5(nb, op1, op2, op3, op4, op5) \
- do { \
- nb->Input(op1.node, op1.index); \
- nb->Input(op2.node, op2.index); \
- nb->Input(op3.node, op3.index); \
- nb->Input(op4.node, op4.index); \
- nb->Input(op5.node, op5.index); \
- } while (0)
-
// TODO(nhasabni) We should move this to mkl_util.h.
void MklLayoutRewritePass::GetDummyMklTensorNode(std::unique_ptr<Graph>* g,
Node** out, Node* orign) {
@@ -335,6 +582,7 @@ void MklLayoutRewritePass::GetDummyMklTensorNode(std::unique_ptr<Graph>* g,
// device as device of original
// node.
.Finalize(&**g, out));
+ (*out)->set_assigned_device_name(orign->assigned_device_name());
}
Status MklLayoutRewritePass::SetUpInputs(
@@ -359,7 +607,7 @@ Status MklLayoutRewritePass::SetUpInputs(
TF_CHECK_OK(GetNodeAttr(n->def(), "T", &T));
// If this op has been rewritten, then its name must have been same as
// Mkl op.
- CHECK_EQ(mkl_layer_registry::IsMklLayer(n->type_string()), true);
+ CHECK_EQ(mkl_layer_registry::IsMklLayer(n->type_string(), T), true);
// src slot number for Mkl tensor would be the one next to TF tensor
// slot number.
new_inputs.push_back(NodeBuilder::NodeOut(n, inputs[i].second + 1));
@@ -380,38 +628,140 @@ Status MklLayoutRewritePass::SetUpInputs(
// N for Mkl tensors corresponding to each Tensorflow tensors.
CHECK_EQ(new_inputs.size(), inputs.size() * 2);
- // 2. Let's build the node with new inputs.
- switch (new_inputs.size()) {
- case 0: // We don't need to do anything for no input as we have
- // already built node.
- break;
- case 1:
- SETUP_INPUTS1(nb, new_inputs[0]);
- break;
- case 2:
- SETUP_INPUTS2(nb, new_inputs[0], new_inputs[1]);
- break;
- case 3:
- SETUP_INPUTS3(nb, new_inputs[0], new_inputs[1], new_inputs[2]);
- break;
- case 4:
- SETUP_INPUTS4(nb, new_inputs[0], new_inputs[1], new_inputs[2],
- new_inputs[3]);
- break;
- case 5:
- SETUP_INPUTS5(nb, new_inputs[0], new_inputs[1], new_inputs[2],
- new_inputs[3], new_inputs[4]);
- break;
- default: {
- return Status(error::Code::UNIMPLEMENTED,
- "Could not create node with given number of inputs");
- }
+ // 2. Let's add the new inputs.
+ for (auto ni : new_inputs) {
+ nb->Input(ni.node, ni.index);
}
return Status::OK();
}
-void MklLayoutRewritePass::CopyAttrsConv2D(Node* orign, NodeBuilder* nb) {
+//////////////////////////////////////////////////////////////////////////
+// Helper functions related to workspace pass
+//////////////////////////////////////////////////////////////////////////
+
+// TODO(nhasabni) We should move this to mkl_util.h.
+void MklLayoutRewritePass::GetDummyWorkspaceTensorNode(
+ std::unique_ptr<Graph>* g, Node** out, Node* orign) {
+ // We use a tensor of shape {1} and value 0 to represent
+ // dummy float tensor. We need this as a dummy workspace tensor.
+ // Workspace tensor has type float.
+ const DataType dt = DataTypeToEnum<float>::v();
+ TensorProto proto;
+ proto.set_dtype(dt);
+ float zero[1] = {0};
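+ // The content is a single zero-valued float element: 4 bytes (sizeof float).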
+ proto.set_tensor_content(const_cast<const void*>(static_cast<void*>(&zero)),
+ 4);
+ TensorShape dummy_shape({1});
+ dummy_shape.AsProto(proto.mutable_tensor_shape());
+ TF_CHECK_OK(
+ NodeBuilder((*g)->NewName("DMT"), "Const")
+ .Attr("value", proto)
+ .Attr("dtype", dt)
+ .Device(orign->def().device()) // We place this node on same
+ // device as device of original
+ // node.
+ .Finalize(&**g, out));
+ (*out)->set_assigned_device_name(orign->assigned_device_name());
+}
+
+void MklLayoutRewritePass::AddWorkSpaceEdgeIfNeeded(std::unique_ptr<Graph>* g,
+ Node* orign,
+ NodeBuilder* nb) {
+ bool workspace_edge_added = false;
+ DataType T;
+ TF_CHECK_OK(GetNodeAttr(orign->def(), "T", &T));
+ for (auto ws : wsinfo_) {
+ if (orign->type_string() == ws.fwdop &&
+ mkl_layer_registry::IsMklLayer(GetMklOpName(orign->type_string()), T)) {
+ // If this op is a fwd op, then we need to check if there is an
+ // edge from this node's fwdslot to bwdop's bwdslot. If there is
+ // an edge, then we just add an attribute on this node for setting
+ // workspace_passed to true. We don't add actual workspace edge
+ // in this node. Actual workspace edge gets added in the backward
+ // op for this node.
+ for (const Edge* e : orign->out_edges()) {
+ if (e->src_output() == ws.fwdslot &&
+ e->dst()->type_string() == ws.bwdop &&
+ e->dst_input() == ws.bwdslot) {
+ nb->Attr("workspace_enabled", true);
+ VLOG(1) << "MklLayoutRewritePass: workspace_enabled for "
+ << orign->type_string();
+ workspace_edge_added = true;
+ // We found the edge that we were looking for, so break.
+ break;
+ }
+ }
+
+ if (!workspace_edge_added) {
+ // If we are here, then we did not find backward operator for this
+ // node.
+ nb->Attr("workspace_enabled", false);
+ }
+ } else if (orign->type_string() == ws.bwdop &&
+ mkl_layer_registry::IsMklLayer(
+ GetMklOpName(orign->type_string()), T)) {
+ // If this op is a bwd op, then we need to add a workspace edge and
+ // its Mkl tensor edge between its corresponding fwd op and this
+ // op. Corresponding fwd op is specified in 'fwdop' field of
+ // workspace info. fwdslot and bwdslot in workspace info specify
+ // an edge between which slots connect forward and backward op.
+ // Once all these criteria match, we add a workspace edge between
+ // wsfwdslot and wsbwdslot. Its corresponding Mkl tensor is added
+ // in wsfwdslot+1 and wsbwdslot+1.
+ for (const Edge* e : orign->in_edges()) {
+ if (e->src_output() == ws.fwdslot &&
+ // We would have rewritten the forward op, so we need to use
+ // GetMklOpName call to get its Mkl name.
+ e->src()->type_string() == GetMklOpName(ws.fwdop) &&
+ e->dst_input() == ws.bwdslot) {
+ nb->Attr("workspace_enabled", true);
+ // Add workspace edge between fwd op and bwd op.
+ nb->Input(e->src(), ws.wsfwdslot);
+ // Add Mkl tensor edge for workspace edge between fwd op and bwd op.
+ nb->Input(e->src(), ws.wsfwdslot + 1);
+ // In terms of input ordering, we add these calls to add Input
+ // here because workspace edge (and its Mkl tensor) is the last
+ // edge in the fwdop and bwdop. So all inputs before workspace
+ // tensor have been added by SetUpInputs function.
+ VLOG(1) << "MklLayoutRewritePass: workspace_enabled for "
+ << orign->type_string();
+ workspace_edge_added = true;
+ // We found the edge that we were looking for, so break.
+ break;
+ }
+ }
+
+ // If we reach this point, we did not find a fwd op that feeds this
+ // bwd op. So in this case, we need to generate dummy tensors for
+ // workspace input and Mkl tensor for workspace, and set
+ // workspace_enabled to false.
+ if (!workspace_edge_added) {
+ nb->Attr("workspace_enabled", false);
+ Node* dmt_ws = nullptr; // Dummy tensor for workspace
+ Node* dmt_mkl_ws = nullptr; // Dummy Mkl tensor for workspace
+ GetDummyWorkspaceTensorNode(g, &dmt_ws, orign);
+ GetDummyMklTensorNode(g, &dmt_mkl_ws, orign);
+ CHECK_NOTNULL(dmt_ws);
+ CHECK_NOTNULL(dmt_mkl_ws);
+ nb->Input(dmt_ws, 0); // We add dummy tensor as workspace tensor.
+ nb->Input(dmt_mkl_ws, 0); // We add dummy tensor as Mkl
+ // tensor for workspace tensor.
+ VLOG(1) << "MklLayoutRewritePass: dummy workspace_enabled for "
+ << orign->type_string();
+ }
+ } else {
+ // If this node does not match any workspace info, then we do not
+ // do anything special for workspace propagation for it.
+ }
+ }
+}
+
+//////////////////////////////////////////////////////////////////////////
+// Op-specific functions to copy attributes from old node to new node
+//////////////////////////////////////////////////////////////////////////
+
+void MklLayoutRewritePass::CopyAttrsConv2D(const Node* orign, NodeBuilder* nb) {
DataType T;
string data_format;
string padding;
@@ -433,19 +783,280 @@ void MklLayoutRewritePass::CopyAttrsConv2D(Node* orign, NodeBuilder* nb) {
nb->Attr("use_cudnn_on_gpu", use_cudnn_on_gpu);
}
+void MklLayoutRewritePass::CopyAttrsBiasAddGrad(const Node* orign,
+ NodeBuilder* nb) {
+ DataType T;
+ string data_format;
+ std::vector<int32> strides;
+
+ // Get all attributes from old node.
+ TF_CHECK_OK(GetNodeAttr(orign->def(), "T", &T));
+ TF_CHECK_OK(GetNodeAttr(orign->def(), "strides", &strides));
+ TF_CHECK_OK(GetNodeAttr(orign->def(), "data_format", &data_format));
+
+ // Add attributes to new node.
+ nb->Attr("T", T);
+ nb->Attr("strides", strides);
+ nb->Attr("data_format", data_format);
+}
+
+void MklLayoutRewritePass::CopyAttrsPooling(const Node* orign,
+ NodeBuilder* nb) {
+ DataType T;
+ string data_format;
+ string padding;
+ std::vector<int32> ksize, strides;
+
+ // Get all attributes from old node.
+ TF_CHECK_OK(GetNodeAttr(orign->def(), "T", &T));
+ TF_CHECK_OK(GetNodeAttr(orign->def(), "ksize", &ksize));
+ TF_CHECK_OK(GetNodeAttr(orign->def(), "strides", &strides));
+ TF_CHECK_OK(GetNodeAttr(orign->def(), "padding", &padding));
+ TF_CHECK_OK(GetNodeAttr(orign->def(), "data_format", &data_format));
+
+ // Add attributes to new node.
+ nb->Attr("T", T);
+ nb->Attr("ksize", ksize);
+ nb->Attr("strides", strides);
+ nb->Attr("padding", padding);
+ nb->Attr("data_format", data_format);
+}
+
+void MklLayoutRewritePass::CopyAttrsRelu(const Node* orign, NodeBuilder* nb) {
+ DataType T;
+
+ // Get all attributes from old node.
+ TF_CHECK_OK(GetNodeAttr(orign->def(), "T", &T));
+
+ // Add attributes to new node.
+ nb->Attr("T", T);
+}
+
+//////////////////////////////////////////////////////////////////////////
+// Helper functions related to node merge pass
+//////////////////////////////////////////////////////////////////////////
+
+Node* MklLayoutRewritePass::CheckForNodeMerge(const Node* a) const {
+ // TODO(nhasabni) Add check for type of node similar to CheckForNodeRewrite
+ // once we support BiasAddGrad as Mkl layer.
+
+ // Search for all matching mergeinfo.
+ // We allow more than one match for extensibility.
+ std::vector<const MergeInfo*> matching_mi;
+ for (auto mi = minfo_.cbegin(); mi != minfo_.cend(); ++mi) {
+ if (a->type_string() == mi->succ) {
+ matching_mi.push_back(&*mi);
+ }
+ }
+
+ for (const MergeInfo* mi : matching_mi) {
+ const int N_in = a->num_inputs();
+ if (mi->op >= N_in) {
+ continue;
+ }
+
+ // Get the control edges and input of node
+ gtl::InlinedVector<Node*, 4> a_control_edges;
+ gtl::InlinedVector<std::pair<Node*, int>, 4> a_in(N_in);
+ FillInputs(a, &a_control_edges, &a_in);
+
+ // Get operand op of the operator
+ Node* b = nullptr;
+ b = a_in[mi->op].first;
+ if (b == nullptr || (b->type_string() != mi->pred)) {
+ // NOTE: Should the first check be assert?
+ continue;
+ }
+
+ gtl::InlinedVector<Node*, 4> b_control_edges;
+ gtl::InlinedVector<std::pair<Node*, int>, 4> b_in(N_in);
+ FillInputs(b, &b_control_edges, &b_in);
+
+ // Shouldn't merge if a and b have different control edges.
+ if (a_control_edges != b_control_edges) {
+ continue;
+ } else {
+ // We found a match.
+ return b;
+ }
+ }
+
+ return nullptr;
+}
+
+Status MklLayoutRewritePass::MergeNode(std::unique_ptr<Graph>* g, Node* succ,
+ Node* pred) {
+ CHECK_NOTNULL(succ);
+ CHECK_NOTNULL(pred);
+
+ if (succ->type_string() == csinfo_.biasadd &&
+ pred->type_string() == csinfo_.mklconv2d) {
+ // 1. Get all attributes from input nodes.
+ DataType T_pred, T_succ;
+ string padding;
+ std::vector<int32> strides;
+ string data_format_pred, data_format_succ;
+ bool use_cudnn_on_gpu;
+ TF_CHECK_OK(GetNodeAttr(pred->def(), "T", &T_pred));
+ TF_CHECK_OK(GetNodeAttr(succ->def(), "T", &T_succ));
+ TF_CHECK_OK(GetNodeAttr(pred->def(), "padding", &padding));
+ TF_CHECK_OK(GetNodeAttr(pred->def(), "strides", &strides));
+ TF_CHECK_OK(GetNodeAttr(pred->def(), "data_format", &data_format_pred));
+ TF_CHECK_OK(GetNodeAttr(succ->def(), "data_format", &data_format_succ));
+ TF_CHECK_OK(
+ GetNodeAttr(pred->def(), "use_cudnn_on_gpu", &use_cudnn_on_gpu));
+ // We check that the data formats of succ and pred are the same. We
+ // expect them to be, so this could be an assert; but an assert can be
+ // too strict, so we enforce it as a check. If the check fails, then we
+ // do not merge the two nodes. We do the same check for devices.
+ if (data_format_pred != data_format_succ || T_pred != T_succ ||
+ pred->assigned_device_name() != succ->assigned_device_name() ||
+ pred->def().device() != succ->def().device()) {
+ return Status(error::Code::INVALID_ARGUMENT,
+ "data_format or T attribute or devices of Conv2D and "
+ "BiasAdd do not match. Will skip node merge optimization");
+ }
+
+ const int succ_num = succ->num_inputs();
+ gtl::InlinedVector<Node*, 4> succ_control_edges;
+ gtl::InlinedVector<std::pair<Node*, int>, 4> succ_in(succ_num);
+ FillInputs(succ, &succ_control_edges, &succ_in);
+
+ const int pred_num = pred->num_inputs();
+ gtl::InlinedVector<Node*, 4> pred_control_edges;
+ gtl::InlinedVector<std::pair<Node*, int>, 4> pred_in(pred_num);
+ FillInputs(pred, &pred_control_edges, &pred_in);
+
+ // We need to ensure that there is only 1 edge between Conv2D and BiasAdd.
+ // Otherwise, merging is semantically incorrect.
+ if (pred->out_edges().size() != 1) {
+ return Status(error::Code::INVALID_ARGUMENT,
+ "Conv2D has multiple outputs."
+ "Will skip node merge optimization");
+ }
+
+ for (const Edge* e : pred->out_edges()) {
+ if (e->dst() != succ) {
+ return Status(error::Code::INVALID_ARGUMENT,
+ "Conv2D does not feed to BiasAdd."
+ "Will skip node merge optimization");
+ }
+ }
+
+ // 2. Get inputs from both nodes.
+ // Find the 2 data inputs of the Conv2D and the bias input of the BiasAdd.
+ // Get operands 0 and 1 of Conv2D and their Mkl tensors.
+ CHECK_EQ(pred->in_edges().size(), 4); // MklConv2D must have 4 inputs.
+ // Get operand 1 of add_bias
+ // BiasAdd must have 2 inputs: Conv, bias
+ CHECK_EQ(succ->in_edges().size(), 2);
+ Node* oper3_mkl = nullptr; // Mkl tensor corresponding to oper3
+ int oper3_mkl_slot = 0; // For dummy MKL tensor node, output slot is 0.
+ GetDummyMklTensorNode(g, &oper3_mkl, succ); // Get dummy Mkl tensor node
+ // as BiasAdd does not have Mkl tensor as input.
+ CHECK_NOTNULL(oper3_mkl);
+
+ // Build the new node. We use the node name of BiasAdd as the name of the
+ // new node, but change the op name.
+ NodeBuilder nb(succ->name(), csinfo_.mklconv2dwithbias);
+ nb.Input(pred_in[0].first, pred_in[0].second); // In1 of Conv2D
+ nb.Input(pred_in[1].first, pred_in[1].second); // Mkl for In1
+ nb.Input(pred_in[2].first, pred_in[2].second); // In2 of Conv2D
+ nb.Input(pred_in[3].first, pred_in[3].second); // Mkl for In2
+ nb.Input(succ_in[1].first, succ_in[1].second); // In2 of BiasAdd
+ nb.Input(oper3_mkl, oper3_mkl_slot); // Mkl for In2 of BiasAdd
+
+ // Copy attributes from Conv2D to Conv2DWithBias.
+ CopyAttrsConv2D(const_cast<const Node*>(pred), &nb);
+
+ // Copy the device assigned to old node to new node.
+ nb.Device(succ->def().device());
+
+ // Create node.
+ Node* newn;
+ nb.Finalize(&**g, &newn);
+ CHECK_NOTNULL(newn);
+
+ // Set the Mkl layer label for this op.
+ newn->AddAttr("_kernel", mkl_layer_registry::kMklLayerLabel);
+
+ // Incoming edges are fixed, we will fix the outgoing edges now.
+ for (const Edge* e : succ->out_edges()) {
+ (*g)->AddEdge(newn, e->src_output(), e->dst(), e->dst_input());
+ }
+
+ // Copy device assigned to old node to new node.
+ // It's ok to use pred or succ as we have enforced a check that
+ // both have same device assigned.
+ newn->set_assigned_device_name(pred->assigned_device_name());
+
+ VLOG(1) << "MklLayoutRewritePass: Merged old node:" << pred->DebugString()
+ << ", and node: " << succ->DebugString()
+ << ", into node:" << newn->DebugString();
+
+ (*g)->RemoveNode(succ);
+ (*g)->RemoveNode(pred);
+ MarkRewrittenNode(newn);
+
+ return Status::OK();
+ }
+
+ return Status(error::Code::UNIMPLEMENTED,
+ "Unimplemented case for node merge optimization.");
+}
+
+//////////////////////////////////////////////////////////////////////////
+// Helper functions for node rewrite
+//////////////////////////////////////////////////////////////////////////
+
Status MklLayoutRewritePass::RewriteNode(std::unique_ptr<Graph>* g, Node* orign,
- const NodesInfo& ni) {
- VLOG(1) << "MKLLayoutRewritePass: Original node:" << orign->DebugString();
+ const RewriteInfo* ri) {
+ CHECK_NOTNULL(ri);
+ CHECK_NOTNULL(orign);
+
+ VLOG(1) << "MklLayoutRewritePass: Original node:" << orign->DebugString();
+
+ // Check if this is scenario 2 (context-based rewrite).
+ // Get the matching ContextInfo if it is.
+ const Node* fwdn = nullptr;
+ const ContextInfo* ci = nullptr;
+ bool is_context_based_rewrite = false;
+ if ((ci = SearchMatchingContext(orign, &fwdn)) != nullptr) {
+ CHECK_NOTNULL(fwdn);
+ is_context_based_rewrite = true;
+
+ // Sanity checks for context-based rewrite (if any)
+ if (orign->type_string() == csinfo_.biasaddgrad &&
+ ri->newname == csinfo_.mklconv2dwithbiasbackpropbias) {
+ DataType orig_T, ctx_T;
+ string orig_data_format, ctx_data_format;
+ TF_CHECK_OK(GetNodeAttr(orign->def(), "T", &orig_T));
+ TF_CHECK_OK(GetNodeAttr(orign->def(), "data_format", &orig_data_format));
+ TF_CHECK_OK(GetNodeAttr(fwdn->def(), "T", &ctx_T));
+ TF_CHECK_OK(GetNodeAttr(fwdn->def(), "data_format", &ctx_data_format));
+
+ if (orig_data_format != ctx_data_format || orig_T != ctx_T ||
+ orign->assigned_device_name() != fwdn->assigned_device_name() ||
+ orign->def().device() != fwdn->def().device()) {
+ return Status(
+ error::Code::INVALID_ARGUMENT,
+ "data_format or T attribute or devices of BiasAddGrad and "
+ "Conv2D do not match. Will skip node rewrite optimization");
+ }
+ }
+ }
// Get all inputs.
const int num = orign->num_inputs();
- CHECK_EQ(num, ni.numins);
+ CHECK_EQ(num, ri->numins);
gtl::InlinedVector<Node*, 4> control_edges;
gtl::InlinedVector<std::pair<Node*, int>, 4> inputs(num);
FillInputs(orign, &control_edges, &inputs);
// Build new node. We use same name as original node, but change the op name.
- NodeBuilder nb(orign->name().c_str(), ni.newname.c_str());
+ NodeBuilder nb(orign->name().c_str(), ri->newname.c_str());
// Copy user-specified device assigned to original node to new node.
nb.Device(orign->def().device());
// Set up new inputs to the rewritten node.
@@ -453,20 +1064,48 @@ Status MklLayoutRewritePass::RewriteNode(std::unique_ptr<Graph>* g, Node* orign,
if (s != Status::OK()) {
return s;
}
- // Copy attributes from original node to new node.
- ni.copyattrs(orign, &nb);
+
+ // Copy attributes from original node to new node (for scenario 1).
+ // For context-based rewrite, we use context to copy the attributes.
+ if (is_context_based_rewrite) {
+ if (orign->type_string() == csinfo_.biasaddgrad &&
+ ri->newname == csinfo_.mklconv2dwithbiasbackpropbias) {
+ CHECK_NOTNULL(fwdn);
+ ri->copyattrs(fwdn, &nb);
+ } else {
+ return Status(error::Code::UNIMPLEMENTED,
+ "Unimplemented case for node rewrite optimization.");
+ }
+ } else {
+ ri->copyattrs(const_cast<const Node*>(orign), &nb);
+ }
// Set the Mkl layer label for this op.
nb.Attr("_kernel", mkl_layer_registry::kMklLayerLabel);
- Node* newn = nullptr;
+
+ // Add workspace edge to this node if needed.
+ // We add workspace edge only for MaxPool, LRN and BatchNorm.
+ AddWorkSpaceEdgeIfNeeded(g, orign, &nb);
// Finalize graph and get new node.
+ Node* newn = nullptr;
TF_CHECK_OK(nb.Finalize(&**g, &newn));
CHECK_NOTNULL(newn);
// Incoming edges from 'orign' node to new 'newn' node are already copied
// in BuildNode. Copy outgoing edges from 'orign' node to new 'newn' node.
+ // Since the output also follows same ordering among Tensorflow tensors and
+ // Mkl tensors. We need to connect Tensorflow tensors appropriately.
+ // Specifically, nth output of original node will become 2*nth output of
+ // Mkl node. GetTensorDataIndex provides this mapping function.
for (const Edge* e : orign->out_edges()) {
- (*g)->AddEdge(newn, e->src_output(), e->dst(), e->dst_input());
+ // We need to handle control-edges by using their original slot number.
+ // Generally, -1 is reserved for control slot.
+ if (e->src_output() < 0) {
+ (*g)->AddEdge(newn, e->src_output(), e->dst(), e->dst_input());
+ } else {
+ (*g)->AddEdge(newn, GetTensorDataIndex(e->src_output()), e->dst(),
+ e->dst_input());
+ }
}
// Copy the runtime device assigned from original code to new node.
@@ -476,10 +1115,123 @@ Status MklLayoutRewritePass::RewriteNode(std::unique_ptr<Graph>* g, Node* orign,
(*g)->RemoveNode(orign);
MarkRewrittenNode(newn);
- VLOG(1) << "MKLLayoutRewritePass: New node:" << newn->DebugString();
+ VLOG(1) << "MklLayoutRewritePass: New node:" << newn->DebugString();
return Status::OK();
}
+const MklLayoutRewritePass::ContextInfo*
+MklLayoutRewritePass::SearchMatchingContext(const Node* n, const Node** fwdn) {
+ CHECK_NOTNULL(n);
+ CHECK_NOTNULL(fwdn);
+ *fwdn = nullptr;
+
+ // Search for matching contextinfo based on node name.
+ // There could be more than one matching contextinfo.
+ bool is_matching_cinfo_found = false;
+ std::vector<const ContextInfo*> mci;
+ for (auto ci = cinfo_.cbegin(); ci != cinfo_.cend(); ++ci) {
+ if (n->type_string() == ci->node) {
+ mci.push_back(&*ci);
+ is_matching_cinfo_found = true;
+ }
+ }
+ // If no matching contextinfo is found, return immediately.
+ if (!is_matching_cinfo_found) {
+ return nullptr;
+ }
+
+ VLOG(1) << "MklLayoutRewritePass: Searching graph for: " << n->type_string()
+ << " in backwards.";
+
+ // We will now search the data-dependence graph backwards (breadth-first,
+ // for up to kNodeMergeContextMaxDepth hops) from n for the forward node
+ // named in the matching contextinfo.
+ // The queue maintains the nodes to be visited, along with their depth,
+ // for the breadth-first search.
+ std::queue<std::pair<const Node*, int>> nqueue;
+ const Node* curr_node = n;
+ size_t curr_depth = 0;
+ nqueue.push(std::make_pair(curr_node, curr_depth));
+
+ // Track visited nodes across the whole search; declared outside the loop
+ // so that nodes visited in earlier iterations are remembered.
+ std::set<const Node*> visited_nodes;
+ visited_nodes.insert(curr_node);
+
+ while (curr_depth < kNodeMergeContextMaxDepth && !nqueue.empty()) {
+ std::pair<const Node*, int> curr_pair = nqueue.front();
+ nqueue.pop();
+
+ curr_node = curr_pair.first;
+ curr_depth = curr_pair.second;
+ CHECK_NOTNULL(curr_node);
+
+ VLOG(1) << "MklLayoutRewritePass: Visiting node: "
+ << curr_node->type_string() << " at depth: " << curr_depth
+ << " for node: " << n->type_string();
+
+ // If we find a match, we return immediately.
+ for (const ContextInfo* ci : mci) {
+ if (curr_node->type_string() == ci->fwd) {
+ *fwdn = curr_node;
+ return ci;
+ }
+ }
+
+ // Else we explore backward edges from current node.
+ // Add the source nodes of all incoming edges of the node to the queue.
+ for (const Edge* e : curr_node->in_edges()) {
+ // We do not visit already visited node.
+ if (visited_nodes.find(e->src()) == visited_nodes.end()) {
+ // Depth of these nodes is 1 more than the depth of current node.
+ nqueue.push(std::make_pair(e->src(), curr_depth + 1));
+ visited_nodes.insert(e->src());
+ }
+ }
+ } /* while */
+
+ return nullptr;
+}
+
+bool MklLayoutRewritePass::ContextMatchRewrite(const Node* n) {
+ const Node* fwdn = nullptr;
+ return SearchMatchingContext(n, &fwdn) != nullptr;
+}
+
+const MklLayoutRewritePass::RewriteInfo*
+MklLayoutRewritePass::CheckForNodeRewrite(const Node* n) const {
+ CHECK_NOTNULL(n);
+
+ // First check if node along with its type is supported by MKL layer.
+ // We do not want to rewrite an op into Mkl op if types are not supported.
+ // E.g., MklRelu does not support INT32. So we cannot rewrite Relu to
+ // MklRelu if type is INT32.
+ DataType T;
+ if (!GetNodeAttr(n->def(), "T", &T).ok()) {
+ return nullptr;
+ }
+ if (!mkl_layer_registry::IsMklLayer(GetMklOpName(n->type_string()), T)) {
+ return nullptr;
+ }
+
+ // We support 2 types of node rewrites:
+ // 1. Rewriting BiasAddGrad depending on its context.
+ // 2. Rewriting an op to Mkl op always
+ // We return true if any of these 2 conditions is met.
+
+ // Find matching RewriteInfo and then check that rewrite rule applies.
+ for (auto ri = rinfo_.cbegin(); ri != rinfo_.cend(); ++ri) {
+ if (n->type_string().compare(ri->name) == 0 && ri->rewriterule(n)) {
+ return &*ri;
+ }
+ }
+
+ // Else return not found.
+ return nullptr;
+}
+
+///////////////////////////////////////////////////////////////////////////////
+// Run function for the pass
+///////////////////////////////////////////////////////////////////////////////
+
bool MklLayoutRewritePass::RunPass(std::unique_ptr<Graph>* g) {
bool result = false;
CHECK_NOTNULL(g);
@@ -494,40 +1246,46 @@ bool MklLayoutRewritePass::RunPass(std::unique_ptr<Graph>* g) {
continue;
}
- for (const NodesInfo& ni : ninfo_) {
- DataType dtype = DT_INVALID;
- // An op needs to have data type (T) attribute and its corresponding
- // Mkl op name must be supported.
- if (GetNodeAttr(n->def(), "T", &dtype) == Status::OK() &&
- mkl_layer_registry::IsMklLayer(GetMklOpName(n->type_string())) &&
- n->type_string().compare(ni.name) == 0) {
- string node_name = n->name();
- string op_name = n->type_string();
-
- VLOG(1) << "MKLLayoutRewritePass: Scheduled node " << node_name
- << " with op " << op_name << " for rewrite using"
- << " layout optimization.";
-
- if (RewriteNode(g, n, ni) == Status::OK()) {
- VLOG(1) << "MKLLayoutRewritePass: Successfully rewrote node "
- << node_name << " with op " << op_name
- << " for Mkl layout optimization.";
- result = true;
- break; // We found matching nodesinfo so no need to search next.
- }
+ const RewriteInfo* ri = nullptr;
+ Node* predn = nullptr;
+ // We will first search if node is to be rewritten
+ if ((ri = CheckForNodeRewrite(n)) != nullptr) {
+ string node_name = n->name();
+ string op_name = n->type_string();
+
+ VLOG(1) << "MklLayoutRewritePass: Scheduled node " << node_name
+ << " with op " << op_name << " for rewrite using"
+ << " layout optimization.";
+
+ if (RewriteNode(g, n, ri) == Status::OK()) {
+ VLOG(1) << "MklLayoutRewritePass: rewrote node " << node_name
+ << " with op " << op_name << " for Mkl layout optimization.";
+ result = true;
+ }
+ } else if ((predn = CheckForNodeMerge(n)) != nullptr) {
+ // Otherwise, we will check if the node is to be merged.
+ string n1_name = n->name();
+ string n2_name = predn->name();
+
+ VLOG(1) << "MklLayoutRewritePass: Scheduled nodes " << n1_name << " and "
+ << n2_name << " for merging";
+
+ if (MergeNode(g, n, predn) == Status::OK()) {
+ VLOG(1) << "MklLayoutRewritePass: Merged nodes " << n1_name << " and "
+ << n2_name;
+ result = true;
}
}
}
DumpGraph("After running MklLayoutRewritePass", &**g);
+ // Clear marked nodes as the same graph pass may be used multiple times.
+ UnMarkRewrittenNodes();
+
return result;
}
-///////////////////////////////////////////////////////////////////////////////
-// Run function for the pass
-///////////////////////////////////////////////////////////////////////////////
-
bool RunMklLayoutRewritePass(std::unique_ptr<Graph>* g) {
return MklLayoutRewritePass().RunPass(g);
}
diff --git a/tensorflow/core/graph/mkl_layout_pass_test.cc b/tensorflow/core/graph/mkl_layout_pass_test.cc
index 10671ee2e9..142d60d611 100644
--- a/tensorflow/core/graph/mkl_layout_pass_test.cc
+++ b/tensorflow/core/graph/mkl_layout_pass_test.cc
@@ -18,7 +18,10 @@ limitations under the License.
#include "tensorflow/core/graph/mkl_layout_pass.h"
#include "tensorflow/core/util/mkl_util.h"
+#include <algorithm>
+#include <string>
#include <vector>
+
#include "tensorflow/core/framework/op.h"
#include "tensorflow/core/framework/tensor.h"
#include "tensorflow/core/graph/graph.h"
@@ -107,10 +110,345 @@ class MklLayoutPassTest : public ::testing::Test {
};
REGISTER_OP("Input").Output("o: float").SetIsStateful();
+REGISTER_OP("HalfInput").Output("o: half").SetIsStateful();
+REGISTER_OP("MklInput").Output("o: uint8").SetIsStateful();
+REGISTER_OP("MklInput2").Output("o: uint8").Output("o1: uint8").SetIsStateful();
+
+/////////////////////////////////////////////////////////////////////
+// Unit tests related to node merge optimization
+/////////////////////////////////////////////////////////////////////
+
+TEST_F(MklLayoutPassTest, Basic) {
+ InitGraph(
+ "node { name: 'A' op: 'Input'}"
+ "node { name: 'B' op: 'Input'}"
+ "node { name: 'C' op: 'Mul' attr { key: 'T' value { type: DT_FLOAT } }"
+ " input: ['A', 'B'] }"
+ "node { name: 'D' op: 'Mul' attr { key: 'T' value { type: DT_FLOAT } }"
+ " input: ['A', 'B'] }");
+ EXPECT_EQ(DoMklLayoutOptimizationPass(),
+ "A(Input);B(Input);C(Mul);D(Mul)|"
+ "A->C;A->D;B->C:1;B->D:1");
+}
+
+// Test set 1: Conv2D + AddBias
+
+// C=MklConv2D(A,M,B,N); E=BiasAdd(C,D); Z=Sub(E,Y)
+TEST_F(MklLayoutPassTest, NodeMerge_Conv2DWithBias_Positive) {
+ InitGraph(
+ "node { name: 'A' op: 'Input'}"
+ "node { name: 'M' op: 'MklInput'}"
+ "node { name: 'B' op: 'Input'}"
+ "node { name: 'N' op: 'MklInput'}"
+ "node { name: 'C' op: 'MklConv2D'"
+ " attr { key: 'T' value { type: DT_FLOAT } }"
+ " attr { key: 'data_format' value { s: 'NCHW' } }"
+ " attr { key: 'use_cudnn_on_gpu' value { b: false } }"
+ " attr { key: 'strides' value { list: {i: 1, i:1, i:1, i:1} } }"
+ " attr { key: 'padding' value { s: 'SAME' } }"
+ " input: ['A', 'M', 'B', 'N']}"
+ "node { name: 'D' op: 'Input'}"
+ "node { name: 'E' op: 'BiasAdd'"
+ " attr { key: 'T' value { type: DT_FLOAT } }"
+ " attr { key: 'data_format' value { s: 'NCHW' } }"
+ " input: ['C', 'D'] }"
+ "node { name: 'Y' op: 'Input'}"
+ "node { name: 'Z' op: 'Sub'"
+ " attr {key: 'T' value { type: DT_FLOAT } }"
+ " input: ['E', 'Y']}");
+ EXPECT_EQ(DoMklLayoutOptimizationPass(),
+ "A(Input);B(Input);D(Input);DMT/_0(Const);E(MklConv2DWithBias);"
+ "M(MklInput);N(MklInput);Y(Input);Z(Sub)|A->E;B->E:2;D->E:4;"
+ "DMT/_0->E:5;E->Z;M->E:1;N->E:3;Y->Z:1");
+}
+
+// C=MklConv2D(A,M:1,B,N:1); E=BiasAdd(C,D); Z=Sub(E,Y)
+// Test for correct output slots selected
+TEST_F(MklLayoutPassTest, NodeMerge_Conv2DWithBias_Positive1) {
+ InitGraph(
+ "node { name: 'A' op: 'Input'}"
+ "node { name: 'M' op: 'MklInput2'}"
+ "node { name: 'B' op: 'Input'}"
+ "node { name: 'N' op: 'MklInput2'}"
+ "node { name: 'C' op: 'MklConv2D'"
+ " attr { key: 'T' value { type: DT_FLOAT } }"
+ " attr { key: 'data_format' value { s: 'NCHW' } }"
+ " attr { key: 'use_cudnn_on_gpu' value { b: false } }"
+ " attr { key: 'strides' value { list: {i: 1, i:1, i:1, i:1} } }"
+ " attr { key: 'padding' value { s: 'SAME' } }"
+ " input: ['A', 'M:1', 'B', 'N:1']}"
+ "node { name: 'D' op: 'Input'}"
+ "node { name: 'E' op: 'BiasAdd'"
+ " attr { key: 'T' value { type: DT_FLOAT } }"
+ " attr { key: 'data_format' value { s: 'NCHW' } }"
+ " input: ['C', 'D'] }"
+ "node { name: 'Y' op: 'Input'}"
+ "node { name: 'Z' op: 'Sub'"
+ " attr {key: 'T' value { type: DT_FLOAT } }"
+ " input: ['E', 'Y']}");
+ EXPECT_EQ(DoMklLayoutOptimizationPass(),
+ "A(Input);B(Input);D(Input);DMT/_0(Const);E(MklConv2DWithBias);"
+ "M(MklInput2);N(MklInput2);Y(Input);Z(Sub)|A->E;B->E:2;D->E:4;"
+ "DMT/_0->E:5;E->Z;M:1->E:1;N:1->E:3;Y->Z:1");
+}
+
+// C=Conv2D(A,B); E=BiasAdd(C,D); Z=Sub(E,Y);
+// This is a case of node rewrite followed by node merge.
+// We will first rewrite Conv2D to MklConv2D, and then merge MklConv2D
+// with BiasAdd to produce MklConv2DWithBias.
+TEST_F(MklLayoutPassTest, NodeMerge_Conv2DWithBias_Positive2) {
+ InitGraph(
+ "node { name: 'A' op: 'Input'}"
+ "node { name: 'B' op: 'Input'}"
+ "node { name: 'C' op: 'Conv2D'"
+ " attr { key: 'T' value { type: DT_FLOAT } }"
+ " attr { key: 'data_format' value { s: 'NCHW' } }"
+ " attr { key: 'use_cudnn_on_gpu' value { b: false } }"
+ " attr { key: 'strides' value { list: {i: 1, i:1, i:1, i:1} } }"
+ " attr { key: 'padding' value { s: 'SAME' } }"
+ " input: ['A', 'B']}"
+ "node { name: 'D' op: 'Input'}"
+ "node { name: 'E' op: 'BiasAdd'"
+ " attr { key: 'T' value { type: DT_FLOAT } }"
+ " attr { key: 'data_format' value { s: 'NCHW' } }"
+ " input: ['C', 'D'] }"
+ "node { name: 'Y' op: 'Input'}"
+ "node { name: 'Z' op: 'Sub'"
+ " attr {key: 'T' value { type: DT_FLOAT } }"
+ " input: ['E', 'Y']}");
+ EXPECT_EQ(DoMklLayoutOptimizationPass(),
+ "A(Input);B(Input);D(Input);DMT/_0(Const);DMT/_1(Const);"
+ "DMT/_2(Const);E(MklConv2DWithBias);Y(Input);Z(Sub)|"
+ "A->E;B->E:2;D->E:4;DMT/_0->E:1;DMT/_1->E:3;DMT/_2->E:5;"
+ "E->Z;Y->Z:1");
+}
+
+// Graph contains only MklConv2D, no BiasAdd.
+TEST_F(MklLayoutPassTest, NodeMerge_Conv2DWithBias_Negative_NoAddBias) {
+ InitGraph(
+ "node { name: 'A' op: 'Input'}"
+ "node { name: 'M' op: 'MklInput'}"
+ "node { name: 'B' op: 'Input'}"
+ "node { name: 'N' op: 'MklInput'}"
+ "node { name: 'C' op: 'MklConv2D'"
+ " attr { key: 'T' value { type: DT_FLOAT } }"
+ " attr { key: 'data_format' value { s: 'NCHW' } }"
+ " attr { key: 'use_cudnn_on_gpu' value { b: false } }"
+ " attr { key: 'strides' value { list: {i: 1, i:1, i:1, i:1} } }"
+ " attr { key: 'padding' value { s: 'SAME' } }"
+ " input: ['A', 'M', 'B', 'N']}");
+ EXPECT_EQ(DoMklLayoutOptimizationPass(),
+ "A(Input);B(Input);C(MklConv2D);M(MklInput);N(MklInput)|"
+ "A->C;B->C:2;M->C:1;N->C:3");
+}
+
+// MklConv2D output does not go to BiasAdd.
+TEST_F(MklLayoutPassTest, NodeMerge_Conv2DWithBias_Negative_Dataflow1) {
+ InitGraph(
+ "node { name: 'A' op: 'Input'}"
+ "node { name: 'M' op: 'MklInput'}"
+ "node { name: 'B' op: 'Input'}"
+ "node { name: 'N' op: 'MklInput'}"
+ "node { name: 'C' op: 'MklConv2D'"
+ " attr { key: 'T' value { type: DT_FLOAT } }"
+ " attr { key: 'data_format' value { s: 'NCHW' } }"
+ " attr { key: 'use_cudnn_on_gpu' value { b: false } }"
+ " attr { key: 'strides' value { list: {i: 1, i:1, i:1, i:1} } }"
+ " attr { key: 'padding' value { s: 'SAME' } }"
+ " input: ['A', 'M', 'B', 'N']}"
+ "node { name: 'D' op: 'Input'}"
+ "node { name: 'E' op: 'Input'}"
+ "node { name: 'F' op: 'BiasAdd'"
+ " attr { key: 'T' value { type: DT_FLOAT } }"
+ " attr { key: 'data_format' value { s: 'NCHW' } }"
+ " input: ['D', 'E'] }"); // Output of MklConv2D does not go to BiasAdd.
+ EXPECT_EQ(DoMklLayoutOptimizationPass(),
+ "A(Input);B(Input);C(MklConv2D);D(Input);E(Input);F(BiasAdd);"
+ "M(MklInput);N(MklInput)|A->C;B->C:2;D->F;E->F:1;M->C:1;N->C:3");
+}
+
+// MklConv2D has two outgoing edges: BiasAdd and some other dummy node (Add).
+// Merge should not be done in such a case.
+TEST_F(MklLayoutPassTest, NodeMerge_Conv2DWithBias_Negative_Dataflow2) {
+ InitGraph(
+ "node { name: 'A' op: 'Input'}"
+ "node { name: 'M' op: 'MklInput'}"
+ "node { name: 'B' op: 'Input'}"
+ "node { name: 'N' op: 'MklInput'}"
+ "node { name: 'C' op: 'MklConv2D'"
+ " attr { key: 'T' value { type: DT_FLOAT } }"
+ " attr { key: 'data_format' value { s: 'NCHW' } }"
+ " attr { key: 'use_cudnn_on_gpu' value { b: false } }"
+ " attr { key: 'strides' value { list: {i: 1, i:1, i:1, i:1} } }"
+ " attr { key: 'padding' value { s: 'SAME' } }"
+ " input: ['A', 'M', 'B', 'N']}"
+ "node { name: 'D' op: 'Input'}"
+ "node { name: 'E' op: 'Input'}"
+ "node { name: 'F' op: 'BiasAdd'"
+ " attr { key: 'T' value { type: DT_FLOAT } }"
+ " attr { key: 'data_format' value { s: 'NCHW' } }"
+ " input: ['D', 'E'] }" // Conv2D has two outputs.
+ // No merge should happen.
+ "node { name: 'G' op: 'Add'"
+ " attr { key: 'T' value { type: DT_FLOAT } }"
+ " input: ['C', 'E'] }");
+ EXPECT_EQ(DoMklLayoutOptimizationPass(),
+ "A(Input);B(Input);C(MklConv2D);D(Input);E(Input);F(BiasAdd);"
+ "G(Add);M(MklInput);N(MklInput)|A->C;B->C:2;C->G;D->F;"
+ "E->F:1;E->G:1;M->C:1;N->C:3");
+}
+
+// data_format attribute value mismatch. Merge should not be done
+// in such a case.
+TEST_F(MklLayoutPassTest, NodeMerge_Conv2DWithBias_Negative_AttrMismatch) {
+ InitGraph(
+ "node { name: 'A' op: 'Input'}"
+ "node { name: 'M' op: 'MklInput'}"
+ "node { name: 'B' op: 'Input'}"
+ "node { name: 'N' op: 'MklInput'}"
+ "node { name: 'C' op: 'MklConv2D'"
+ " attr { key: 'T' value { type: DT_FLOAT } }"
+ " attr { key: 'data_format' value { s: 'NCHW' } }"
+ " attr { key: 'use_cudnn_on_gpu' value { b: false } }"
+ " attr { key: 'strides' value { list: {i: 1, i:1, i:1, i:1} } }"
+ " attr { key: 'padding' value { s: 'SAME' } }"
+ " input: ['A', 'M', 'B', 'N']}"
+ "node { name: 'D' op: 'Input'}"
+ "node { name: 'E' op: 'BiasAdd'"
+ " attr { key: 'T' value { type: DT_FLOAT } }"
+ " attr { key: 'data_format' value { s: 'NHCW' } }"
+ " input: ['C', 'D'] }");
+ EXPECT_EQ(DoMklLayoutOptimizationPass(),
+ "A(Input);B(Input);C(MklConv2D);D(Input);E(BiasAdd);M(MklInput);"
+ "N(MklInput)|A->C;B->C:2;C->E;D->E:1;M->C:1;N->C:3");
+}
+
+// No MklConv2DWithBias in the context for BiasAddGrad (only MklConv2D),
+// so no rewrite for BiasAddGrad should happen.
+// C=MklConv2D(A,M,B,N); D=Sub(C,A); E=BiasAddGrad(D)
+TEST_F(MklLayoutPassTest, NodeMerge_Conv2DBackprop_Neg_NoMklConv2DWithBias) {
+ InitGraph(
+ "node { name: 'A' op: 'Input'}"
+ "node { name: 'M' op: 'MklInput'}"
+ "node { name: 'B' op: 'Input'}"
+ "node { name: 'N' op: 'MklInput'}"
+ "node { name: 'C' op: 'MklConv2D'"
+ " attr { key: 'T' value { type: DT_FLOAT } }"
+ " attr { key: 'data_format' value { s: 'NCHW' } }"
+ " attr { key: 'use_cudnn_on_gpu' value { b: false } }"
+ " attr { key: 'strides' value { list: {i: 1, i:1, i:1, i:1} } }"
+ " attr { key: 'padding' value { s: 'SAME' } }"
+ " input: ['A', 'M', 'B', 'N']}"
+ "node { name: 'D' op: 'Sub'"
+ " attr {key: 'T' value { type: DT_FLOAT } }"
+ " input: ['C', 'A']}"
+ "node { name: 'E' op: 'BiasAddGrad'"
+ " attr { key: 'T' value { type: DT_FLOAT } }"
+ " attr { key: 'data_format' value { s: 'NCHW' } }"
+ " input: ['D'] }");
+ EXPECT_EQ(DoMklLayoutOptimizationPass(),
+ "A(Input);B(Input);C(MklConv2D);D(Sub);E(BiasAddGrad);"
+ "M(MklInput);N(MklInput)|A->C;A->D:1;B->C:2;C->D;D->E;"
+ "M->C:1;N->C:3");
+}
+
+// No Conv2D in the context for BiasAddGrad. No rewrite should happen.
+// C=Add(A,B); D=Sub(C,A); E=BiasAddGrad(D)
+TEST_F(MklLayoutPassTest, NodeMerge_Conv2DBackprop_Negative_NoConv2D) {
+ InitGraph(
+ "node { name: 'A' op: 'Input'}"
+ "node { name: 'B' op: 'Input'}"
+ "node { name: 'C' op: 'Add'"
+ " attr { key: 'T' value { type: DT_FLOAT } }"
+ " input: ['A', 'B']}"
+ "node { name: 'D' op: 'Sub'"
+ " attr {key: 'T' value { type: DT_FLOAT } }"
+ " input: ['C', 'A']}"
+ "node { name: 'E' op: 'BiasAddGrad'"
+ " attr { key: 'T' value { type: DT_FLOAT } }"
+ " attr { key: 'data_format' value { s: 'NCHW' } }"
+ " input: ['D'] }");
+ EXPECT_EQ(DoMklLayoutOptimizationPass(),
+ "A(Input);B(Input);C(Add);D(Sub);E(BiasAddGrad)|"
+ "A->C;A->D:1;B->C:1;C->D;D->E");
+}
+
+// No Conv2D in the context for BiasAddGrad, but MatMul is in the context.
+// Rewrite should happen, but the name of BiasAddGrad does not change.
+// C=MatMul(A,B); D=Sub(C,A); E=BiasAddGrad(D)
+TEST_F(MklLayoutPassTest, NodeMerge_Conv2DBackprop_Negative_NoConv2D_MatMul) {
+ InitGraph(
+ "node { name: 'A' op: 'Input'}"
+ "node { name: 'B' op: 'Input'}"
+ "node { name: 'C' op: 'MatMul'"
+ " attr { key: 'T' value { type: DT_FLOAT } }"
+ " attr { key: 'transpose_a' value { b: false } }"
+ " attr { key: 'transpose_b' value { b: false } }"
+ " input: ['A', 'B']}"
+ "node { name: 'D' op: 'Sub'"
+ " attr {key: 'T' value { type: DT_FLOAT } }"
+ " input: ['C', 'A']}"
+ "node { name: 'E' op: 'BiasAddGrad'"
+ " attr { key: 'T' value { type: DT_FLOAT } }"
+ " attr { key: 'data_format' value { s: 'NCHW' } }"
+ " input: ['D'] }");
+ EXPECT_EQ(DoMklLayoutOptimizationPass(),
+ "A(Input);B(Input);C(MatMul);D(Sub);E(BiasAddGrad)|"
+ "A->C;A->D:1;B->C:1;C->D;D->E");
+}
+
+// Test set 3: MatMul + BiasAddGrad -> BiasAddGrad rewrite tests
+// C=MatMul(A,B); D=Sub(C,A); E=BiasAddGrad(D)
+TEST_F(MklLayoutPassTest, NodeMerge_MatMulBiasAddGrad_Positive) {
+ InitGraph(
+ "node { name: 'A' op: 'Input'}"
+ "node { name: 'B' op: 'Input'}"
+ "node { name: 'C' op: 'MatMul'"
+ " attr { key: 'T' value { type: DT_FLOAT } }"
+ " attr { key: 'transpose_a' value { b: false } }"
+ " attr { key: 'transpose_b' value { b: false } }"
+ " input: ['A', 'B']}"
+ "node { name: 'D' op: 'Sub'"
+ " attr {key: 'T' value { type: DT_FLOAT } }"
+ " input: ['C', 'A']}"
+ "node { name: 'E' op: 'BiasAddGrad'"
+ " attr { key: 'T' value { type: DT_FLOAT } }"
+ " attr { key: 'data_format' value { s: 'NCHW' } }"
+ " input: ['D'] }");
+ EXPECT_EQ(DoMklLayoutOptimizationPass(),
+ "A(Input);B(Input);C(MatMul);D(Sub);E(BiasAddGrad)|"
+ "A->C;A->D:1;B->C:1;C->D;D->E");
+}
+
+// No MatMul in the context for BiasAddGrad. No rewrite should happen.
+// C=Add(A,B); D=Sub(C,A); E=BiasAddGrad(D)
+TEST_F(MklLayoutPassTest, NodeMerge_MatMulBiasAddGrad_Negative_NoMatMul) {
+ InitGraph(
+ "node { name: 'A' op: 'Input'}"
+ "node { name: 'B' op: 'Input'}"
+ "node { name: 'C' op: 'Add'"
+ " attr { key: 'T' value { type: DT_FLOAT } }"
+ " input: ['A', 'B']}"
+ "node { name: 'D' op: 'Sub'"
+ " attr {key: 'T' value { type: DT_FLOAT } }"
+ " input: ['C', 'A']}"
+ "node { name: 'E' op: 'BiasAddGrad'"
+ " attr { key: 'T' value { type: DT_FLOAT } }"
+ " attr { key: 'data_format' value { s: 'NCHW' } }"
+ " input: ['D'] }");
+ EXPECT_EQ(DoMklLayoutOptimizationPass(),
+ "A(Input);B(Input);C(Add);D(Sub);E(BiasAddGrad)|"
+ "A->C;A->D:1;B->C:1;C->D;D->E");
+}
+
+/////////////////////////////////////////////////////////////////////
+// Unit tests related to rewriting node to Mkl node
+/////////////////////////////////////////////////////////////////////
// Single Conv2D Op; No Mkl layer on the input and on the output.
// We will generate dummy Mkl tensor as 2nd input of Conv2D.
-TEST_F(MklLayoutPassTest, Conv2D_Basic) {
+TEST_F(MklLayoutPassTest, NodeRewrite_Conv2D_Basic) {
InitGraph(
"node { name: 'A' op: 'Input'}"
"node { name: 'B' op: 'Input'}"
@@ -130,7 +468,7 @@ TEST_F(MklLayoutPassTest, Conv2D_Basic) {
// 2 Conv2D Ops in sequence. Both should get transformed and 1st Conv2D will
// have 2 outputs, both of which will be inputs to next Conv2D.
-TEST_F(MklLayoutPassTest, Conv2D_Positive1) {
+TEST_F(MklLayoutPassTest, NodeRewrite_Conv2D_Positive1) {
InitGraph(
"node { name: 'A' op: 'Input'}"
"node { name: 'B' op: 'Input'}"
@@ -156,6 +494,104 @@ TEST_F(MklLayoutPassTest, Conv2D_Positive1) {
"C:1->D:3;D->E:1;DMT/_0->C:1;DMT/_1->C:3;DMT/_2->D:1");
}
+// Conv2D with half type (DT_HALF), which is not supported by Mkl
+TEST_F(MklLayoutPassTest, NodeRewrite_Conv2D_Negative_UnsupportedType) {
+ InitGraph(
+ "node { name: 'A' op: 'HalfInput'}"
+ "node { name: 'B' op: 'HalfInput'}"
+ "node { name: 'C' op: 'Conv2D'"
+ " attr { key: 'T' value { type: DT_HALF } }"
+ " attr { key: 'data_format' value { s: 'NCHW' } }"
+ " attr { key: 'use_cudnn_on_gpu' value { b: false } }"
+ " attr { key: 'strides' value { list: {i: 1, i:1, i:1, i:1} } }"
+ " attr { key: 'padding' value { s: 'SAME' } }"
+ " input: ['A', 'B']}"
+ "node { name: 'D' op: 'Mul' attr { key: 'T' value { type: DT_HALF } }"
+ " input: ['B', 'C'] }");
+ EXPECT_EQ(DoMklLayoutOptimizationPass(),
+ "A(HalfInput);B(HalfInput);C(Conv2D);D(Mul)|"
+ "A->C;B->C:1;B->D;C->D:1");
+}
+
+/////////////////////////////////////////////////////////////////////
+// Unit tests related to rewriting node for workspace edges
+/////////////////////////////////////////////////////////////////////
+
+// Test MaxPool->MaxPoolGrad replacement by workspace+rewrite nodes.
+TEST_F(MklLayoutPassTest, NodeWorkspace_MaxPool_Positive) {
+ InitGraph(
+ "node { name: 'A' op: 'Input'}"
+ "node { name: 'B' op: 'MaxPool'"
+ " attr { key: 'T' value { type: DT_FLOAT } }"
+ " attr { key: 'data_format' value { s: 'NCHW' } }"
+ " attr { key: 'ksize' value { list: {i: 1, i:1, i:3, i:3} } }"
+ " attr { key: 'padding' value { s: 'VALID' } }"
+ " attr { key: 'strides' value { list: {i: 1, i:1, i:2, i:2} } }"
+ " input: ['A'] }"
+ "node { name: 'C' op: 'Input'}"
+ "node { name: 'D' op: 'Input'}"
+ "node { name: 'E' op: 'MaxPoolGrad'"
+ " attr { key: 'T' value { type: DT_FLOAT } }"
+ " attr { key: 'data_format' value { s: 'NCHW' } }"
+ " attr { key: 'ksize' value { list: {i: 1, i:1, i:3, i:3} } }"
+ " attr { key: 'padding' value { s: 'VALID' } }"
+ " attr { key: 'strides' value { list: {i: 1, i:1, i:2, i:2} } }"
+ " input: ['C', 'B', 'D'] }"
+ "node { name: 'F' op: 'Mul' attr { key: 'T' value { type: DT_FLOAT } }"
+ " input: ['C', 'E'] }");
+ EXPECT_EQ(DoMklLayoutOptimizationPass(),
+ "A(Input);B(MklMaxPool);C(Input);D(Input);DMT/_0(Const);"
+ "DMT/_1(Const);DMT/_2(Const);E(MklMaxPoolGrad);F(Mul)|"
+ "A->B;B->E:2;B:1->E:3;B:2->E:6;B:3->E:7;C->E;C->F;D->E:4;"
+ "DMT/_0->B:1;DMT/_1->E:1;DMT/_2->E:5;E->F:1");
+}
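One way to read the expected string above, inferred purely from its edge list (the slot meanings are an assumption based on the DMT/workspace naming, not on op documentation):

    // Inferred input layout of E(MklMaxPoolGrad):
    //   slot 0: C       original input
    //   slot 1: DMT/_1  dummy Mkl metadata for C
    //   slot 2: B       MaxPool output
    //   slot 3: B:1     Mkl metadata for B
    //   slot 4: D       output gradient
    //   slot 5: DMT/_2  dummy Mkl metadata for D
    //   slot 6: B:2     workspace tensor added by the rewrite
    //   slot 7: B:3     Mkl metadata for the workspace tensor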
+
+// Test MaxPool->MaxPoolGrad replacement when only one of them is present.
+// In this case, we will rewrite the MaxPool node, but workspace edges will
+// not be present.
+TEST_F(MklLayoutPassTest, NodeWorkspace_MaxPool_Negative1) {
+ InitGraph(
+ "node { name: 'A' op: 'Input'}"
+ "node { name: 'B' op: 'MaxPool'"
+ " attr { key: 'T' value { type: DT_FLOAT } }"
+ " attr { key: 'data_format' value { s: 'NCHW' } }"
+ " attr { key: 'ksize' value { list: {i: 1, i:1, i:3, i:3} } }"
+ " attr { key: 'padding' value { s: 'VALID' } }"
+ " attr { key: 'strides' value { list: {i: 1, i:1, i:2, i:2} } }"
+ " input: ['A'] }"
+ "node { name: 'C' op: 'Mul' attr { key: 'T' value { type: DT_FLOAT } }"
+ " input: ['A', 'B'] }");
+ EXPECT_EQ(DoMklLayoutOptimizationPass(),
+ "A(Input);B(MklMaxPool);C(Mul);DMT/_0(Const)|"
+ "A->B;A->C;B->C:1;DMT/_0->B:1");
+}
+
+// Test MaxPool->MaxPoolGrad replacement when only one of them is present.
+// In this case, we will rewrite MaxPoolGrad, and generate dummy tensors
+// for the workspace tensor and its Mkl part.
+TEST_F(MklLayoutPassTest, NodeWorkspace_MaxPool_Negative2) {
+ InitGraph(
+ "node { name: 'A' op: 'Input'}"
+ "node { name: 'B' op: 'Input'}"
+ "node { name: 'C' op: 'Input'}"
+ "node { name: 'D' op: 'MaxPoolGrad'"
+ " attr { key: 'T' value { type: DT_FLOAT } }"
+ " attr { key: 'data_format' value { s: 'NCHW' } }"
+ " attr { key: 'ksize' value { list: {i: 1, i:1, i:3, i:3} } }"
+ " attr { key: 'padding' value { s: 'VALID' } }"
+ " attr { key: 'strides' value { list: {i: 1, i:1, i:2, i:2} } }"
+ " input: ['A', 'B', 'C'] }"
+ "node { name: 'E' op: 'Mul' attr { key: 'T' value { type: DT_FLOAT } }"
+ " input: ['A', 'D'] }");
+ EXPECT_EQ(DoMklLayoutOptimizationPass(),
+ "A(Input);B(Input);C(Input);D(MklMaxPoolGrad);DMT/_0(Const);"
+ "DMT/_1(Const);DMT/_2(Const);DMT/_3(Const);DMT/_4(Const);E(Mul)|"
+ "A->D;A->E;B->D:2;C->D:4;D->E:1;DMT/_0->D:1;DMT/_1->D:3;"
+ "DMT/_2->D:5;DMT/_3->D:6;DMT/_4->D:7");
+}
+
+/////////////////////////////////////////////////////////////////////
+
static void BM_MklLayoutRewritePass(int iters, int op_nodes) {
testing::StopTiming();
string s;
diff --git a/tensorflow/core/graph/mkl_tfconversion_pass.cc b/tensorflow/core/graph/mkl_tfconversion_pass.cc
index 8c3adad6f0..7c3836b308 100644
--- a/tensorflow/core/graph/mkl_tfconversion_pass.cc
+++ b/tensorflow/core/graph/mkl_tfconversion_pass.cc
@@ -81,9 +81,10 @@ class MklToTfConversionPass : public GraphOptimizationPass {
// Is the input Op supported by Mkl-specific layout?
//
// @input op_name string of the op
+  // @input T datatype of the input op being checked
// @return true if op is Mkl supported; false, otherwise.
- inline bool IsMklSupportedOp(const string& op_name) const {
- return mkl_layer_registry::IsMklLayer(op_name);
+ inline bool IsMklSupportedOp(const string& op_name, DataType T) const {
+ return mkl_layer_registry::IsMklLayer(op_name, T);
}
// Insert layout conversion node on the edge pointed by 'e' from graph 'g'.
@@ -188,6 +189,13 @@ bool MklToTfConversionPass::RunPass(std::unique_ptr<Graph>* g) {
continue;
}
+    // We skip adding a MklToTf node on any edge whose source or
+    // destination is already a MklToTf node.
+ if (src->type_string().compare("MklToTf") == 0 ||
+ dst->type_string().compare("MklToTf") == 0) {
+ continue;
+ }
+
VLOG(1) << "MklToTfConversionPass: InsertConversionNodes: "
<< src->type_string() << " and " << dst->type_string();
@@ -202,8 +210,9 @@ bool MklToTfConversionPass::RunPass(std::unique_ptr<Graph>* g) {
GetNodeAttr(dst->def(), "T", &dst_datatype);
   // Check if src is Mkl-compliant, while dst is not Mkl-compliant.
- if (IsMklSupportedOp(src->type_string()) &&
- !IsMklSupportedOp(dst->type_string())) {
+
+ if (IsMklSupportedOp(src->type_string(), src_datatype) &&
+ !IsMklSupportedOp(dst->type_string(), dst_datatype)) {
VLOG(1) << "MklToTfConversionPass: Scheduled nodes " << src->name()
<< " and " << dst->name() << " for inserting conversion nodes";
candidate_edges.push_back(const_cast<Edge*>(e));
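A minimal sketch of the kind of (op, type)-keyed lookup this change relies on; the real mkl_layer_registry API is only partially visible in this diff, so the class below is a hypothetical stand-in:

    #include <set>
    #include <string>
    #include <utility>

    // Hypothetical registry: an op counts as an Mkl layer only if it was
    // registered for this exact (op_name, T) pair, so e.g. ("Conv2D",
    // DT_FLOAT) can be supported while ("Conv2D", DT_HALF) is not.
    enum DataType { DT_FLOAT, DT_HALF };
    class MklLayerRegistry {
     public:
      void Register(const std::string& op, DataType t) {
        layers_.insert({op, t});
      }
      bool IsMklLayer(const std::string& op, DataType t) const {
        return layers_.count({op, t}) > 0;
      }
     private:
      std::set<std::pair<std::string, DataType>> layers_;
    };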
diff --git a/tensorflow/core/graph/mkl_tfconversion_pass_test.cc b/tensorflow/core/graph/mkl_tfconversion_pass_test.cc
index 0a63cf6ddb..7d9237f845 100644
--- a/tensorflow/core/graph/mkl_tfconversion_pass_test.cc
+++ b/tensorflow/core/graph/mkl_tfconversion_pass_test.cc
@@ -17,7 +17,10 @@ limitations under the License.
#include "tensorflow/core/graph/mkl_tfconversion_pass.h"
+#include <algorithm>
+#include <string>
#include <vector>
+
#include "tensorflow/core/framework/op.h"
#include "tensorflow/core/framework/tensor.h"
#include "tensorflow/core/graph/graph.h"
@@ -146,31 +149,34 @@ TEST_F(MklToTfConversionPass, Positive) {
"C:1->Mkl2Tf/_0:1;D->E:1;M->C:1;Mkl2Tf/_0->E;N->C:3");
}
-// MklConv2D followed by Non-Mkl layer, and MklConv2D uses half type
-// C=MklConv2D(A,M,B,N); E=Sub(C,D)
-// MklToTf node should be inserted.
-TEST_F(MklToTfConversionPass, Positive_Type) {
+// MklConv2D followed by MklToTf op followed by Non-Mkl layer.
+// C=MklConv2D(A,M,B,N); D=MklToTf(C:0, C:1); F=Sub(D,E)
+// MklToTf node should not be inserted again.
+TEST_F(MklToTfConversionPass, Negative_DoubleInsert) {
InitGraph(
- "node { name: 'A' op: 'HalfInput'}"
+ "node { name: 'A' op: 'Input'}"
"node { name: 'M' op: 'MklInput'}"
- "node { name: 'B' op: 'HalfInput'}"
+ "node { name: 'B' op: 'Input'}"
"node { name: 'N' op: 'MklInput'}"
"node { name: 'C' op: 'MklConv2D'"
- " attr { key: 'T' value { type: DT_HALF } }"
+ " attr { key: 'T' value { type: DT_FLOAT } }"
" attr { key: 'data_format' value { s: 'NCHW' } }"
" attr { key: 'use_cudnn_on_gpu' value { b: false } }"
" attr { key: 'strides' value { list: {i: 1, i:1, i:1, i:1} } }"
" attr { key: 'padding' value { s: 'SAME' } }"
" input: ['A', 'M', 'B', 'N']}"
- "node { name: 'D' op: 'HalfInput'}"
- "node { name: 'E' op: 'Sub'"
- " attr {key: 'T' value { type: DT_HALF } }"
- " input: ['C', 'D']}");
+ "node { name: 'D' op: 'MklToTf'"
+ " attr { key: 'T' value { type: DT_FLOAT } }"
+ " attr { key: 'data_format' value { s: 'NCHW' } }"
+ " input: ['C:0', 'C:1']}"
+ "node { name: 'E' op: 'Input'}"
+ "node { name: 'F' op: 'Sub'"
+ " attr {key: 'T' value { type: DT_FLOAT } }"
+ " input: ['D', 'E']}");
EXPECT_EQ(DoRunMklToTfConversionPass(),
- "A(HalfInput);B(HalfInput);C(MklConv2D);D(HalfInput);"
- "E(Sub);M(MklInput);Mkl2Tf/_0(MklToTf);N(MklInput)|"
- "A->C;B->C:2;C->Mkl2Tf/_0;C:1->Mkl2Tf/_0:1;D->E:1;"
- "M->C:1;Mkl2Tf/_0->E;N->C:3");
+ "A(Input);B(Input);C(MklConv2D);D(MklToTf);E(Input);"
+ "F(Sub);M(MklInput);N(MklInput)|"
+ "A->C;B->C:2;C->D;C:1->D:1;D->F;E->F:1;M->C:1;N->C:3");
}
// C=Conv2D(A,B); E=BiasAdd(C,D); Z=Sub(E,Y);
diff --git a/tensorflow/core/kernels/BUILD b/tensorflow/core/kernels/BUILD
index 9f516efd71..9c47d520d9 100644
--- a/tensorflow/core/kernels/BUILD
+++ b/tensorflow/core/kernels/BUILD
@@ -27,6 +27,7 @@ load(
"tf_copts",
"tf_opts_nortti_if_android",
"tf_kernel_library",
+ "tf_mkl_kernel_library",
"cc_header_only_library",
)
load("//tensorflow:tensorflow.bzl", "tf_cuda_cc_test")
@@ -2241,6 +2242,12 @@ tf_kernel_library(
tf_kernel_library(
name = "matmul_op",
+ srcs = [
+ "matmul_op.cc",
+ ] + if_mkl([
+ "mkl_matmul_op.cc",
+ ]),
+ hdrs = ["matmul_op.h"],
defines = select({
":xsmm": [
"TENSORFLOW_USE_LIBXSMM",
@@ -2248,13 +2255,14 @@ tf_kernel_library(
],
"//conditions:default": [],
}),
- prefix = "matmul_op",
deps = MATH_DEPS + select({
":xsmm": [
"@libxsmm_archive//:xsmm_avx",
],
"//conditions:default": [],
- }),
+ }) + if_mkl([
+ "//third_party/mkl:intel_binary_blob",
+ ]),
)
tf_kernel_library(
@@ -2770,6 +2778,7 @@ tf_kernel_library(
"cudnn_pooling_gpu.h",
"fractional_pool_common.h",
"maxpooling_op.h",
+ "pooling_ops_3d.h",
"pooling_ops_common.h",
],
gpu_srcs = [
@@ -2780,6 +2789,8 @@ tf_kernel_library(
"maxpooling_op_gpu.h",
"pooling_ops_common.h",
"pooling_ops_common_gpu.h",
+ "pooling_ops_3d_gpu.h",
+ "pooling_ops_3d_gpu.cu.cc",
],
deps = [
":conv_2d",
@@ -4468,49 +4479,69 @@ tf_cc_test(
],
)
-if_mkl(
- tf_kernel_library(
- name = "mkl_matmul_op",
- prefix = "mkl_matmul",
- deps = [
- ":math",
- "//third_party/mkl:intel_binary_blob",
- ],
- ),
+tf_mkl_kernel_library(
+ name = "mkl_conv_op",
+ prefix = "mkl_conv",
+ deps = [
+ ":bounds_check",
+ ":conv_ops",
+ ":ops_util",
+ "//tensorflow/core:core_cpu",
+ "//tensorflow/core:framework",
+ "//tensorflow/core:lib",
+ "//tensorflow/core:lib_internal",
+ "//tensorflow/core:nn_ops_op_lib",
+ "//third_party/mkl:intel_binary_blob",
+ ],
)
-if_mkl(
- tf_kernel_library(
- name = "mkl_conv_op",
- prefix = "mkl_conv",
- deps = [
- ":bounds_check",
- ":ops_util",
- "//tensorflow/core:core_cpu",
- "//tensorflow/core:framework",
- "//tensorflow/core:lib",
- "//tensorflow/core:lib_internal",
- "//tensorflow/core:nn_ops_op_lib",
- "//third_party/mkl:intel_binary_blob",
- ],
- ),
+tf_mkl_kernel_library(
+ name = "mkl_tfconv_op",
+ prefix = "mkl_tfconv",
+ deps = [
+ ":bounds_check",
+ ":ops_util",
+ "//tensorflow/core:core_cpu",
+ "//tensorflow/core:framework",
+ "//tensorflow/core:lib",
+ "//tensorflow/core:lib_internal",
+ "//tensorflow/core:nn_ops_op_lib",
+ "//third_party/mkl:intel_binary_blob",
+ ],
)
-if_mkl(
- tf_kernel_library(
- name = "mkl_tfconv_op",
- prefix = "mkl_tfconv",
- deps = [
- ":bounds_check",
- ":ops_util",
- "//tensorflow/core:core_cpu",
- "//tensorflow/core:framework",
- "//tensorflow/core:lib",
- "//tensorflow/core:lib_internal",
- "//tensorflow/core:nn_ops_op_lib",
- "//third_party/mkl:intel_binary_blob",
- ],
- ),
+tf_mkl_kernel_library(
+ name = "mkl_pooling_ops",
+ srcs = [
+ "mkl_avgpooling_op.cc",
+ "mkl_maxpooling_op.cc",
+ "mkl_pooling_ops_common.cc",
+ ],
+ hdrs = ["mkl_pooling_ops_common.h"],
+ deps = [
+ ":ops_util",
+ "//tensorflow/core:core_cpu",
+ "//tensorflow/core:framework",
+ "//tensorflow/core:lib",
+ "//tensorflow/core:lib_internal",
+ "//tensorflow/core:nn_ops_op_lib",
+ "//third_party/mkl:intel_binary_blob",
+ ],
+)
+
+tf_mkl_kernel_library(
+ name = "mkl_relu_op",
+ prefix = "mkl_relu",
+ deps = [
+ ":bounds_check",
+ ":ops_util",
+ "//tensorflow/core:core_cpu",
+ "//tensorflow/core:framework",
+ "//tensorflow/core:lib",
+ "//tensorflow/core:lib_internal",
+ "//tensorflow/core:nn_ops_op_lib",
+ "//third_party/mkl:intel_binary_blob",
+ ],
)
# -----------------------------------------------------------------------------
diff --git a/tensorflow/core/kernels/conv_grad_filter_ops.cc b/tensorflow/core/kernels/conv_grad_filter_ops.cc
index 2e385f2c55..f88862bfeb 100644
--- a/tensorflow/core/kernels/conv_grad_filter_ops.cc
+++ b/tensorflow/core/kernels/conv_grad_filter_ops.cc
@@ -30,6 +30,9 @@ limitations under the License.
#include "tensorflow/core/framework/tensor_shape.h"
#include "tensorflow/core/framework/tensor_slice.h"
#include "tensorflow/core/kernels/conv_2d.h"
+#ifdef TENSORFLOW_USE_LIBXSMM
+#include "tensorflow/core/kernels/xsmm_conv2d.h"
+#endif
#include "tensorflow/core/kernels/ops_util.h"
#include "tensorflow/core/lib/core/errors.h"
#include "tensorflow/core/lib/gtl/array_slice.h"
@@ -88,6 +91,75 @@ namespace tensorflow {
typedef Eigen::ThreadPoolDevice CPUDevice;
typedef Eigen::GpuDevice GPUDevice;
+#ifdef TENSORFLOW_USE_LIBXSMM
+template <typename Device, class T>
+struct LaunchXsmmBackwardFilter {
+ bool operator()(OpKernelContext* context, const Device& d,
+ typename TTypes<T, 4>::ConstTensor input_backward,
+ typename TTypes<T, 4>::Tensor kernel,
+ typename TTypes<T, 4>::ConstTensor output_backward,
+ int input_rows, int input_cols, int row_stride,
+ int col_stride, int pad_h, int pad_w,
+ TensorFormat data_format) const {
+ return false;
+ }
+};
+
+template <>
+struct LaunchXsmmBackwardFilter<CPUDevice, float> {
+ bool operator()(OpKernelContext* context, const CPUDevice& d,
+ typename TTypes<float, 4>::ConstTensor input,
+ typename TTypes<float, 4>::Tensor filter,
+ typename TTypes<float, 4>::ConstTensor output, int input_rows,
+ int input_cols, int row_stride, int col_stride, int pad_h,
+ int pad_w, TensorFormat data_format) const {
+ auto batch = input.dimension(0);
+ auto in_depth = input.dimension(3);
+ auto out_depth = output.dimension(3);
+ auto filter_rows = filter.dimension(0);
+ auto filter_cols = filter.dimension(1);
+
+ auto num_threads =
+ context->device()->tensorflow_cpu_worker_threads()->num_threads;
+ // See libxsmm_dnn.h for this struct definition.
+ libxsmm_dnn_conv_desc desc;
+ desc.N = batch;
+ desc.C = in_depth;
+ desc.H = input_rows;
+ desc.W = input_cols;
+ desc.K = out_depth;
+ desc.R = filter_rows;
+ desc.S = filter_cols;
+ desc.u = row_stride;
+ desc.v = col_stride;
+ desc.pad_h = pad_h;
+ desc.pad_w = pad_w;
+    desc.pad_h_in = 0;  // pad_rows is ignored by libxsmm for now.
+    desc.pad_w_in = 0;  // pad_cols is ignored by libxsmm for now.
+ desc.pad_h_out = 0;
+ desc.pad_w_out = 0;
+ desc.threads = num_threads;
+ desc.algo = LIBXSMM_DNN_CONV_ALGO_DIRECT;
+ desc.buffer_format = LIBXSMM_DNN_TENSOR_FORMAT_NHWC;
+ desc.filter_format = LIBXSMM_DNN_TENSOR_FORMAT_RSCK;
+ desc.fuse_ops = LIBXSMM_DNN_CONV_FUSE_NONE;
+ desc.options = LIBXSMM_DNN_CONV_OPTION_NONE;
+ desc.datatype = LIBXSMM_DNN_DATATYPE_F32;
+
+ if (!CanUseXsmmConv2D(desc, data_format)) {
+ return false;
+ }
+
+ auto input_ptr = input.data();
+ auto filter_ptr = filter.data();
+ auto output_ptr = output.data();
+ bool success = functor::XsmmBkwFilterConv2D<CPUDevice, float>()(
+ context, desc, input_ptr, filter_ptr, output_ptr);
+ return success;
+ }
+};
+#endif
+
template <typename Device, class T>
class Conv2DFastBackpropFilterOp : public OpKernel {
public:
@@ -135,6 +207,36 @@ class Conv2DFastBackpropFilterOp : public OpKernel {
OP_REQUIRES_OK(context,
context->allocate_output(0, filter_shape, &filter_backprop));
+#if defined TENSORFLOW_USE_LIBXSMM && defined TENSORFLOW_USE_LIBXSMM_BACKWARD
+
+ int64 pad_top, pad_bottom;
+ int64 pad_left, pad_right;
+ OP_REQUIRES_OK(
+ context,
+ GetWindowedOutputSizeVerbose(
+ dims.spatial_dims[0].input_size, dims.spatial_dims[0].filter_size,
+ dims.spatial_dims[0].stride, padding_,
+ &dims.spatial_dims[0].output_size, &pad_top, &pad_bottom));
+ OP_REQUIRES_OK(
+ context,
+ GetWindowedOutputSizeVerbose(
+ dims.spatial_dims[1].input_size, dims.spatial_dims[1].filter_size,
+ dims.spatial_dims[1].stride, padding_,
+ &dims.spatial_dims[1].output_size, &pad_left, &pad_right));
+
+ if (pad_left == pad_right && pad_top == pad_bottom) {
+ if (LaunchXsmmBackwardFilter<Device, T>()(
+ context, context->eigen_device<Device>(), input.tensor<T, 4>(),
+ filter_backprop->tensor<T, 4>(), out_backprop.tensor<T, 4>(),
+ dims.spatial_dims[0].input_size, dims.spatial_dims[1].input_size,
+ (int)dims.spatial_dims[0].stride,
+ (int)dims.spatial_dims[1].stride, (int)pad_top, (int)pad_left,
+ data_format_)) {
+ return;
+ }
+ }
+#endif
+
functor::SpatialConvolutionBackwardKernel<Device, T>()(
context->eigen_device<Device>(), filter_backprop->tensor<T, 4>(),
input.tensor<T, 4>(), out_backprop.tensor<T, 4>(),
@@ -213,6 +315,19 @@ class Conv2DCustomBackpropFilterOp : public OpKernel {
dims.spatial_dims[1].input_size, dims.spatial_dims[1].filter_size,
dims.spatial_dims[1].stride, padding_,
&dims.spatial_dims[1].output_size, &pad_left, &pad_right));
+#if defined TENSORFLOW_USE_LIBXSMM && defined TENSORFLOW_USE_LIBXSMM_BACKWARD
+ if (pad_left == pad_right && pad_top == pad_bottom) {
+ if (LaunchXsmmBackwardFilter<Device, T>()(
+ context, context->eigen_device<Device>(), input.tensor<T, 4>(),
+ filter_backprop->tensor<T, 4>(), out_backprop.tensor<T, 4>(),
+ dims.spatial_dims[0].input_size, dims.spatial_dims[1].input_size,
+ (int)dims.spatial_dims[0].stride,
+ (int)dims.spatial_dims[1].stride, (int)pad_top, (int)pad_left,
+ data_format_)) {
+ return;
+ }
+ }
+#endif
// The total dimension size of each kernel.
const int filter_total_size = dims.spatial_dims[0].filter_size *
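The new pad_left == pad_right && pad_top == pad_bottom guard exists because SAME padding can be asymmetric, while the xsmm descriptor takes a single pad_h/pad_w per axis. A small worked example, assuming the usual SAME-padding arithmetic behind GetWindowedOutputSizeVerbose:

    #include <algorithm>
    #include <cstdio>

    // out = ceil(in / stride); the "after" side absorbs any odd remainder,
    // so padding may come out asymmetric and the xsmm path must be skipped.
    void SamePadding(int in, int filter, int stride) {
      int out = (in + stride - 1) / stride;
      int total = std::max((out - 1) * stride + filter - in, 0);
      int before = total / 2, after = total - before;
      std::printf("in=%d filter=%d stride=%d -> pad %d/%d%s\n", in, filter,
                  stride, before, after,
                  before == after ? "" : " (asymmetric: fall back)");
    }

    int main() {
      SamePadding(5, 3, 2);  // pad 1/1: symmetric, xsmm path eligible
      SamePadding(4, 3, 2);  // pad 0/1: asymmetric, default path is used
    }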
diff --git a/tensorflow/core/kernels/conv_grad_input_ops.cc b/tensorflow/core/kernels/conv_grad_input_ops.cc
index 8bc79bebd9..e79c9465cb 100644
--- a/tensorflow/core/kernels/conv_grad_input_ops.cc
+++ b/tensorflow/core/kernels/conv_grad_input_ops.cc
@@ -131,7 +131,8 @@ struct LaunchXsmmBackwardInputConvolution {
typename TTypes<T, 4>::ConstTensor kernel,
typename TTypes<T, 4>::ConstTensor output_backward,
int input_rows, int input_cols, int row_stride,
- int col_stride, TensorFormat data_format) const {
+ int col_stride, int pad_h, int pad_w,
+ TensorFormat data_format) const {
return false;
}
};
@@ -143,7 +144,8 @@ struct LaunchXsmmBackwardInputConvolution<CPUDevice, float> {
typename TTypes<float, 4>::ConstTensor kernel,
typename TTypes<float, 4>::ConstTensor output_backward,
int input_rows, int input_cols, int row_stride,
- int col_stride, TensorFormat data_format) const {
+ int col_stride, int pad_h, int pad_w,
+ TensorFormat data_format) const {
auto batch = input_backward.dimension(0);
auto in_depth = input_backward.dimension(3);
auto out_depth = output_backward.dimension(3);
@@ -162,10 +164,10 @@ struct LaunchXsmmBackwardInputConvolution<CPUDevice, float> {
desc.S = filter_cols;
desc.u = row_stride;
desc.v = col_stride;
- desc.pad_h = 0;
- desc.pad_w = 0;
- desc.pad_h_in = 0; // pad_rows; // ignored by libxsmm for now.
- desc.pad_w_in = 0; // pad_cols; // ignored by libxsmm for now.
+ desc.pad_h = pad_h;
+ desc.pad_w = pad_w;
+ desc.pad_h_in = 0;
+ desc.pad_w_in = 0;
desc.pad_h_out = 0;
desc.pad_w_out = 0;
desc.threads = num_threads;
@@ -174,7 +176,7 @@ struct LaunchXsmmBackwardInputConvolution<CPUDevice, float> {
desc.filter_format =
LIBXSMM_DNN_TENSOR_FORMAT_LIBXSMM; // LIBXSMM_DNN_TENSOR_FORMAT_RSCK;
desc.fuse_ops = LIBXSMM_DNN_CONV_FUSE_NONE;
- desc.options = LIBXSMM_DNN_CONV_OPTION_NONE;
+ desc.options = LIBXSMM_DNN_CONV_OPTION_WU_EXT_FILTER_REDUCE;
desc.datatype = LIBXSMM_DNN_DATATYPE_F32;
auto input_ptr = input_backward.data();
@@ -236,13 +238,31 @@ class Conv2DFastBackpropInputOp : public OpKernel {
context->allocate_output(0, input_shape, &in_backprop));
#if defined TENSORFLOW_USE_LIBXSMM && defined TENSORFLOW_USE_LIBXSMM_BACKWARD
- if (LaunchXsmmBackwardInputConvolution<Device, T>()(
- context, context->eigen_device<Device>(),
- in_backprop->tensor<T, 4>(), filter.tensor<T, 4>(),
- out_backprop.tensor<T, 4>(), dims.spatial_dims[0].input_size,
- dims.spatial_dims[1].input_size, dims.spatial_dims[0].stride,
- dims.spatial_dims[1].stride, data_format_)) {
- return;
+ int64 pad_top, pad_bottom;
+ int64 pad_left, pad_right;
+ OP_REQUIRES_OK(
+ context,
+ GetWindowedOutputSizeVerbose(
+ dims.spatial_dims[0].input_size, dims.spatial_dims[0].filter_size,
+ dims.spatial_dims[0].stride, padding_,
+ &dims.spatial_dims[0].output_size, &pad_top, &pad_bottom));
+ OP_REQUIRES_OK(
+ context,
+ GetWindowedOutputSizeVerbose(
+ dims.spatial_dims[1].input_size, dims.spatial_dims[1].filter_size,
+ dims.spatial_dims[1].stride, padding_,
+ &dims.spatial_dims[1].output_size, &pad_left, &pad_right));
+
+ if (pad_left == pad_right && pad_top == pad_bottom) {
+ if (LaunchXsmmBackwardInputConvolution<Device, T>()(
+ context, context->eigen_device<Device>(),
+ in_backprop->tensor<T, 4>(), filter.tensor<T, 4>(),
+ out_backprop.tensor<T, 4>(), dims.spatial_dims[0].input_size,
+ dims.spatial_dims[1].input_size, (int)dims.spatial_dims[0].stride,
+ (int)dims.spatial_dims[1].stride, (int)pad_top, (int)pad_left,
+ data_format_)) {
+ return;
+ }
}
#endif
@@ -309,21 +329,39 @@ class Conv2DCustomBackpropInputOp : public OpKernel {
OP_REQUIRES_OK(context,
context->allocate_output(0, input_shape, &in_backprop));
+// TODO(andydavis) Consider moving code shared with
+// Conv2DCustomBackpropFilterOp into a shared helper function.
#if defined TENSORFLOW_USE_LIBXSMM && defined TENSORFLOW_USE_LIBXSMM_BACKWARD
- if (LaunchXsmmBackwardInputConvolution<Device, T>()(
- context, context->eigen_device<Device>(),
- in_backprop->tensor<T, 4>(), filter.tensor<T, 4>(),
- out_backprop.tensor<T, 4>(), dims.spatial_dims[0].input_size,
- dims.spatial_dims[1].input_size, dims.spatial_dims[0].stride,
- dims.spatial_dims[1].stride, data_format_)) {
- return;
- }
-#endif
+ int64 pad_top, pad_bottom;
+ int64 pad_left, pad_right;
+ OP_REQUIRES_OK(
+ context,
+ GetWindowedOutputSizeVerbose(
+ dims.spatial_dims[0].input_size, dims.spatial_dims[0].filter_size,
+ dims.spatial_dims[0].stride, padding_,
+ &dims.spatial_dims[0].output_size, &pad_top, &pad_bottom));
+ OP_REQUIRES_OK(
+ context,
+ GetWindowedOutputSizeVerbose(
+ dims.spatial_dims[1].input_size, dims.spatial_dims[1].filter_size,
+ dims.spatial_dims[1].stride, padding_,
+ &dims.spatial_dims[1].output_size, &pad_left, &pad_right));
- // TODO(andydavis) Consider moving code shared with
- // Conv2DCustomBackpropFilterOp into a shared helper function.
+ if (pad_left == pad_right && pad_top == pad_bottom) {
+ if (LaunchXsmmBackwardInputConvolution<Device, T>()(
+ context, context->eigen_device<Device>(),
+ in_backprop->tensor<T, 4>(), filter.tensor<T, 4>(),
+ out_backprop.tensor<T, 4>(), dims.spatial_dims[0].input_size,
+ dims.spatial_dims[1].input_size, (int)dims.spatial_dims[0].stride,
+ (int)dims.spatial_dims[1].stride, (int)pad_top, (int)pad_left,
+ data_format_)) {
+ return;
+ }
+ }
+#else
int64 pad_top, pad_bottom;
int64 pad_left, pad_right;
+#endif
OP_REQUIRES_OK(
context,
GetWindowedOutputSizeVerbose(
diff --git a/tensorflow/core/kernels/conv_ops.cc b/tensorflow/core/kernels/conv_ops.cc
index facfe4467d..8076daf387 100644
--- a/tensorflow/core/kernels/conv_ops.cc
+++ b/tensorflow/core/kernels/conv_ops.cc
@@ -213,8 +213,8 @@ class LaunchXsmmConvOp<CPUDevice, float> {
desc.v = stride_cols;
desc.pad_h = pad_rows;
desc.pad_w = pad_cols;
- desc.pad_h_in = pad_rows; // libxsmm supports only physical padding for now
- desc.pad_w_in = pad_cols; // libxsmm supports only physical padding for now
+ desc.pad_h_in = 0;
+ desc.pad_w_in = 0;
desc.pad_h_out = 0;
desc.pad_w_out = 0;
desc.threads = num_threads;
@@ -222,13 +222,17 @@ class LaunchXsmmConvOp<CPUDevice, float> {
desc.buffer_format = LIBXSMM_DNN_TENSOR_FORMAT_NHWC;
desc.filter_format = LIBXSMM_DNN_TENSOR_FORMAT_LIBXSMM;
desc.fuse_ops = LIBXSMM_DNN_CONV_FUSE_NONE;
- desc.options = LIBXSMM_DNN_CONV_OPTION_NONE;
+ desc.options = LIBXSMM_DNN_CONV_OPTION_WU_EXT_FILTER_REDUCE;
desc.datatype = LIBXSMM_DNN_DATATYPE_F32;
if (!CanUseXsmmConv2D(desc, data_format)) {
return false;
}
auto input_ptr = input.template flat<float>().data();
auto filter_ptr = filter.template flat<float>().data();
auto output_ptr = output->template flat<float>().data();
diff --git a/tensorflow/core/kernels/conv_ops_gpu_3.cu.cc b/tensorflow/core/kernels/conv_ops_gpu_3.cu.cc
index bb99d627a5..2307c2de0e 100644
--- a/tensorflow/core/kernels/conv_ops_gpu_3.cu.cc
+++ b/tensorflow/core/kernels/conv_ops_gpu_3.cu.cc
@@ -548,9 +548,11 @@ template struct functor::TransformFilter<GPUDevice, Eigen::half, int, 4>;
template struct functor::ReverseTransformFilter<GPUDevice, float, 4>;
template struct functor::ReverseTransformFilter<GPUDevice, Eigen::half, 4>;
+template struct functor::NHWCToNCHW<GPUDevice, double, 4>;
template struct functor::NHWCToNCHW<GPUDevice, float, 4>;
template struct functor::NHWCToNCHW<GPUDevice, Eigen::half, 4>;
+template struct functor::NCHWToNHWC<GPUDevice, double, 4>;
template struct functor::NCHWToNHWC<GPUDevice, float, 4>;
template struct functor::NCHWToNHWC<GPUDevice, Eigen::half, 4>;
diff --git a/tensorflow/core/kernels/cudnn_pooling_gpu.cc b/tensorflow/core/kernels/cudnn_pooling_gpu.cc
index 66f9249234..5939ecdf62 100644
--- a/tensorflow/core/kernels/cudnn_pooling_gpu.cc
+++ b/tensorflow/core/kernels/cudnn_pooling_gpu.cc
@@ -18,6 +18,7 @@ limitations under the License.
#include <array>
+#include "tensorflow/core/framework/register_types.h"
#include "tensorflow/core/kernels/conv_2d.h"
#include "tensorflow/core/kernels/conv_3d.h"
#include "tensorflow/core/kernels/conv_ops_gpu.h"
@@ -242,8 +243,11 @@ void DnnPooling3dGradOp<T>::Compute(
}
}
-template class DnnPooling3dOp<float>;
-template class DnnPooling3dGradOp<float>;
+#define DEFINE_DNN_OPS(T) \
+ template class DnnPooling3dOp<T>; \
+ template class DnnPooling3dGradOp<T>;
+TF_CALL_float(DEFINE_DNN_OPS) TF_CALL_half(DEFINE_DNN_OPS)
+#undef DEFINE_DNN_OPS
#endif // GOOGLE_CUDA
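For readers unfamiliar with the TF_CALL_* pattern, the new registration block expands roughly as follows (a sketch: TF_CALL_float and TF_CALL_half each invoke the given macro with their type):

    // Approximate expansion of the block above:
    template class DnnPooling3dOp<float>;
    template class DnnPooling3dGradOp<float>;
    template class DnnPooling3dOp<Eigen::half>;
    template class DnnPooling3dGradOp<Eigen::half>;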
diff --git a/tensorflow/core/kernels/maxpooling_op.cc b/tensorflow/core/kernels/maxpooling_op.cc
index 41c6251ac7..eb590280c9 100644
--- a/tensorflow/core/kernels/maxpooling_op.cc
+++ b/tensorflow/core/kernels/maxpooling_op.cc
@@ -24,6 +24,7 @@ limitations under the License.
#include "tensorflow/core/common_runtime/device.h"
#include "tensorflow/core/framework/numeric_op.h"
#include "tensorflow/core/framework/op_kernel.h"
+#include "tensorflow/core/framework/register_types.h"
#include "tensorflow/core/framework/tensor.h"
#include "tensorflow/core/framework/tensor_shape.h"
#include "tensorflow/core/framework/tensor_slice.h"
@@ -46,6 +47,7 @@ limitations under the License.
namespace tensorflow {
typedef Eigen::ThreadPoolDevice CPUDevice;
+typedef Eigen::GpuDevice GPUDevice;
const int kInvalidMaxPoolingIndex = -1;
@@ -187,40 +189,6 @@ static void SpatialMaxPoolWithArgMaxHelper(
params.tensor_in_batch, shard_cost, shard);
}
-REGISTER_KERNEL_BUILDER(
- Name("MaxPool").Device(DEVICE_CPU).TypeConstraint<float>("T"),
- MaxPoolingOp<CPUDevice, float>);
-REGISTER_KERNEL_BUILDER(
- Name("MaxPool").Device(DEVICE_CPU).TypeConstraint<Eigen::half>("T"),
- MaxPoolingOp<CPUDevice, Eigen::half>);
-
-#if GOOGLE_CUDA
-// Forward declarations for the functor specializations for GPU.
-namespace functor {
-#define DECLARE_GPU_SPEC(T) \
- template <> \
- void SpatialMaxPooling<Eigen::GpuDevice, T>::operator()( \
- const Eigen::GpuDevice& d, typename TTypes<T, 4>::Tensor output, \
- typename TTypes<T, 4>::ConstTensor input, int window_rows, \
- int window_cols, int row_stride, int col_stride, \
- const Eigen::PaddingType& padding); \
- extern template struct SpatialMaxPooling<Eigen::GpuDevice, T>;
-
-DECLARE_GPU_SPEC(float);
-#undef DECLARE_GPU_SPEC
-} // namespace functor
-
-// Note(jiayq): Currently, the Caffe custom implementation is faster than the
-// default Eigen implementation so we are using the custom kernel as the
-// default. However, you can explicitly invoke the eigen version using
-// kernel_label_map.
-REGISTER_KERNEL_BUILDER(Name("MaxPool")
- .Device(DEVICE_GPU)
- .TypeConstraint<float>("T")
- .Label("eigen_tensor"),
- MaxPoolingOp<Eigen::GpuDevice, float>);
-#endif // GOOGLE_CUDA
-
// The operation to compute MaxPool gradients.
// It takes three inputs:
// - The original input tensor
@@ -237,7 +205,7 @@ class MaxPoolingGradOp : public OpKernel {
errors::InvalidArgument("Invalid data format"));
OP_REQUIRES(
context, data_format_ == FORMAT_NHWC,
- errors::InvalidArgument("Default MaxPoolinGradOp only supports NHWC ",
+ errors::InvalidArgument("Default MaxPoolingGradOp only supports NHWC ",
"on device type ",
DeviceTypeString(context->device_type())));
OP_REQUIRES_OK(context, context->GetAttr("ksize", &ksize_));
@@ -305,13 +273,6 @@ class MaxPoolingGradOp : public OpKernel {
TensorFormat data_format_;
};
-REGISTER_KERNEL_BUILDER(
- Name("MaxPoolGrad").Device(DEVICE_CPU).TypeConstraint<float>("T"),
- MaxPoolingGradOp<CPUDevice, float>);
-REGISTER_KERNEL_BUILDER(
- Name("MaxPoolGrad").Device(DEVICE_CPU).TypeConstraint<Eigen::half>("T"),
- MaxPoolingGradOp<CPUDevice, Eigen::half>);
-
#ifdef GOOGLE_CUDA
template <typename T>
@@ -329,13 +290,13 @@ static void MaxPoolingBackwardCustomKernel(
return;
}
- MaxPoolBackwardNoMask(
+ functor::MaxPoolBackwardNoMask<T>()(
tensor_in->flat<T>().data(), params.tensor_in_batch,
params.tensor_in_rows, params.tensor_in_cols, params.depth,
params.out_height, params.out_width, params.window_rows,
params.window_cols, params.row_stride, params.col_stride, params.pad_rows,
- params.pad_cols, out_backprop.flat<T>().data(),
- output->flat<T>().data(), context->eigen_device<Eigen::GpuDevice>());
+ params.pad_cols, out_backprop.flat<T>().data(), output->flat<T>().data(),
+ context->eigen_device<Eigen::GpuDevice>());
}
template <class T>
@@ -403,12 +364,252 @@ class MaxPoolingGradOp<Eigen::GpuDevice, T> : public OpKernel {
bool use_dnn_;
};
-REGISTER_KERNEL_BUILDER(
- Name("MaxPoolGrad").Device(DEVICE_GPU).TypeConstraint<float>("T"),
- MaxPoolingGradOp<Eigen::GpuDevice, float>);
-REGISTER_KERNEL_BUILDER(
- Name("MaxPoolGrad").Device(DEVICE_GPU).TypeConstraint<Eigen::half>("T"),
- MaxPoolingGradOp<Eigen::GpuDevice, Eigen::half>);
+#endif // GOOGLE_CUDA
+
+// The operation to compute gradient of MaxPool gradients.
+// It takes three inputs:
+// - The original input tensor
+// - The original output tensor
+// - Backprop tensor for output gradients
+// It produces one output: backprop tensor for output gradient.
+template <class Device, class T>
+class MaxPoolingGradGradOp : public OpKernel {
+ public:
+ explicit MaxPoolingGradGradOp(OpKernelConstruction* context)
+ : OpKernel(context) {
+ string data_format;
+ OP_REQUIRES_OK(context, context->GetAttr("data_format", &data_format));
+ OP_REQUIRES(context, FormatFromString(data_format, &data_format_),
+ errors::InvalidArgument("Invalid data format"));
+ OP_REQUIRES(
+ context, data_format_ == FORMAT_NHWC,
+ errors::InvalidArgument(
+ "Default MaxPoolingGradGradOp only supports NHWC ",
+ "on device type ", DeviceTypeString(context->device_type())));
+ OP_REQUIRES_OK(context, context->GetAttr("ksize", &ksize_));
+ OP_REQUIRES(context, ksize_.size() == 4,
+ errors::InvalidArgument("Sliding window ksize field must "
+ "specify 4 dimensions"));
+ OP_REQUIRES_OK(context, context->GetAttr("strides", &stride_));
+ OP_REQUIRES(context, stride_.size() == 4,
+ errors::InvalidArgument("Sliding window strides field must "
+ "specify 4 dimensions"));
+ OP_REQUIRES_OK(context, context->GetAttr("padding", &padding_));
+ OP_REQUIRES(context, ksize_[0] == 1 && stride_[0] == 1,
+ errors::Unimplemented(
+ "Pooling is not yet supported on the batch dimension."));
+ OP_REQUIRES(
+ context, ksize_[3] == 1 && stride_[3] == 1,
+ errors::Unimplemented(
+ "MaxPoolingGradGrad is not yet supported on the depth dimension."));
+ }
+
+ void Compute(OpKernelContext* context) override {
+ const Tensor& tensor_in = context->input(0);
+ const Tensor& tensor_out = context->input(1);
+ const Tensor& out_grad_backprop = context->input(2);
+
+ // For maxpooling, tensor_in should have 4 dimensions.
+ OP_REQUIRES(context, tensor_in.dims() == 4,
+ errors::InvalidArgument("tensor_in must be 4-dimensional"));
+ OP_REQUIRES(context, tensor_out.dims() == 4,
+ errors::InvalidArgument("tensor_out must be 4-dimensional"));
+ // For maxpooling, out_grad_backprop should have 4 dimensions.
+ OP_REQUIRES(
+ context, out_grad_backprop.dims() == 4,
+ errors::InvalidArgument("out_grad_backprop must be 4-dimensional"));
+
+ PoolParameters params{context, ksize_, stride_,
+ padding_, FORMAT_NHWC, tensor_in.shape()};
+ Tensor* output = nullptr;
+ OP_REQUIRES_OK(context, context->forward_input_or_allocate_output(
+ {2}, 0, tensor_out.shape(), &output));
+
+ SpatialMaxPoolGradGrad(context, output, tensor_in, tensor_out,
+ out_grad_backprop, params, padding_);
+ }
+
+ private:
+ void SpatialMaxPoolGradGrad(OpKernelContext* context, Tensor* bottom_diff,
+ const Tensor& tensor_in, const Tensor& tensor_out,
+ const Tensor& top_diff,
+ const PoolParameters& params,
+ const Padding& padding) {
+ typedef Eigen::Map<const Eigen::Matrix<T, Eigen::Dynamic, Eigen::Dynamic>>
+ ConstEigenMatrixMap;
+ typedef Eigen::Map<Eigen::Matrix<T, Eigen::Dynamic, Eigen::Dynamic>>
+ EigenMatrixMap;
+
+ ConstEigenMatrixMap in_mat(
+ tensor_in.flat<T>().data(), params.depth,
+ params.tensor_in_cols * params.tensor_in_rows * params.tensor_in_batch);
+ ConstEigenMatrixMap out_mat(
+ tensor_out.flat<T>().data(), params.depth,
+ params.out_width * params.out_height * params.tensor_in_batch);
+ ConstEigenMatrixMap top_diff_mat(
+ top_diff.flat<T>().data(), params.depth,
+ params.tensor_in_cols * params.tensor_in_rows * params.tensor_in_batch);
+ EigenMatrixMap bottom_diff_mat(
+ bottom_diff->flat<T>().data(), params.depth,
+ params.out_width * params.out_height * params.tensor_in_batch);
+
+ const DeviceBase::CpuWorkerThreads& worker_threads =
+ *(context->device()->tensorflow_cpu_worker_threads());
+
+    // The code below does the following:
+ // 1. Flattens the input, output, top_diff and bottom_diff tensors into
+ // two dimensional arrays.
+ // tensor_in_as_matrix:
+ // depth by (tensor_in_cols * tensor_in_rows * tensor_in_batch)
+ // tensor_out_as_matrix:
+ // depth by (out_width * out_height * tensor_in_batch)
+ // top_diff_as_matrix:
+ // depth by (tensor_in_cols * tensor_in_rows * tensor_in_batch)
+ // bottom_diff_as_matrix:
+ // depth by (out_width * out_height * tensor_in_batch)
+ //
+ // 2. Walks through the set of columns in the flattened
+ // tensor_in_as_matrix, tensor_out_as_matrix, top_diff_as_matrix
+ // and updates the column(s) corresponding to the maximum values in
+ // tensor_out_as_matrix with the corresponding values in
+ // top_diff_as_matrix.
+ auto shard = [&params, &in_mat, &out_mat, &top_diff_mat, &bottom_diff_mat](
+ int64 start, int64 limit) {
+ const int32 depth = params.depth;
+ const int32 in_rows = params.tensor_in_rows;
+ const int32 in_cols = params.tensor_in_cols;
+ const int32 pad_rows = params.pad_rows;
+ const int32 pad_cols = params.pad_cols;
+ const int32 window_rows = params.window_rows;
+ const int32 window_cols = params.window_cols;
+ const int32 row_stride = params.row_stride;
+ const int32 col_stride = params.col_stride;
+ const int32 out_height = params.out_height;
+ const int32 out_width = params.out_width;
+
+ {
+ // Initializes the output grad backprop tensor with 0.
+ const int32 output_image_size = out_height * out_width * params.depth;
+ EigenMatrixMap bottom_diff_shard(
+ bottom_diff_mat.data() + start * output_image_size, 1,
+ (limit - start) * output_image_size);
+ bottom_diff_shard.setZero();
+ }
+
+ for (int b = start; b < limit; ++b) {
+ for (int ph = 0; ph < out_height; ++ph) {
+ for (int pw = 0; pw < out_width; ++pw) {
+ // (h_start, h_end) * (w_start, w_end) is the range that the input
+ // vector projects to.
+ int h_start = ph * row_stride - pad_rows;
+ const int h_end = std::min(h_start + window_rows, in_rows);
+ int w_start = pw * col_stride - pad_cols;
+ const int w_end = std::min(w_start + window_cols, in_cols);
+ h_start = std::max(h_start, 0);
+ w_start = std::max(w_start, 0);
+ const int out_index = (b * out_height + ph) * out_width + pw;
+ // Find value corresponding to the input maximum in top_diff.
+ for (int d = 0; d < depth; ++d) {
+ const T& output_ref = out_mat.coeffRef(d, out_index);
+ bool should_stop = false;
+ for (int h = h_start; h < h_end && !should_stop; ++h) {
+ for (int w = w_start; w < w_end && !should_stop; ++w) {
+ const int in_index = (b * in_rows + h) * in_cols + w;
+ const T& input_ref = in_mat.coeffRef(d, in_index);
+ if (output_ref == input_ref) {
+ T& bottom_diff_ref = bottom_diff_mat.coeffRef(d, out_index);
+ bottom_diff_ref = top_diff_mat.coeffRef(d, in_index);
+ should_stop = true;
+ }
+ }
+ }
+ }
+ }
+ }
+ }
+ };
+
+ const int64 shard_cost = params.out_width * params.out_height *
+ params.depth * params.window_rows *
+ params.window_cols;
+ Shard(worker_threads.num_threads, worker_threads.workers,
+ params.tensor_in_batch, shard_cost, shard);
+ }
+
+ std::vector<int32> ksize_;
+ std::vector<int32> stride_;
+ Padding padding_;
+ TensorFormat data_format_;
+};
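A tiny worked example of what SpatialMaxPoolGradGrad computes (values invented for illustration):

    // One batch, one channel, 2x2 input, 2x2 window, VALID padding:
    //   tensor_in = [[1, 3],     tensor_out = [3]   (max at input (0,1))
    //                [2, 0]]
    //   top_diff  = [[10, 20],   (same shape as tensor_in)
    //                [30, 40]]
    // The scan finds the first input position whose value equals the output
    // max -- here (0,1) -- and copies top_diff at that position into
    // bottom_diff, which has the output shape:
    //   bottom_diff = [20]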
+
+#ifdef GOOGLE_CUDA
+
+template <class T>
+class MaxPoolingGradGradOp<Eigen::GpuDevice, T> : public OpKernel {
+ public:
+ typedef Eigen::GpuDevice Device;
+
+ explicit MaxPoolingGradGradOp(OpKernelConstruction* context)
+ : OpKernel(context) {
+ string data_format;
+ OP_REQUIRES_OK(context, context->GetAttr("data_format", &data_format));
+ OP_REQUIRES(context, FormatFromString(data_format, &data_format_),
+ errors::InvalidArgument("Invalid data format"));
+ OP_REQUIRES_OK(context, context->GetAttr("ksize", &ksize_));
+ OP_REQUIRES(context, ksize_.size() == 4,
+ errors::InvalidArgument("Sliding window ksize field must "
+ "specify 4 dimensions"));
+ OP_REQUIRES_OK(context, context->GetAttr("strides", &stride_));
+ OP_REQUIRES(context, stride_.size() == 4,
+ errors::InvalidArgument("Sliding window strides field must "
+ "specify 4 dimensions"));
+ OP_REQUIRES_OK(context, context->GetAttr("padding", &padding_));
+ const int32 ksize_n = GetTensorDim(ksize_, data_format_, 'N');
+ const int32 stride_n = GetTensorDim(stride_, data_format_, 'N');
+ OP_REQUIRES(context, ksize_n == 1 && stride_n == 1,
+ errors::Unimplemented(
+ "Pooling is not yet supported on the batch dimension."));
+ }
+
+ void Compute(OpKernelContext* context) override {
+ const Tensor& tensor_in = context->input(0);
+ const Tensor& tensor_out = context->input(1);
+ const Tensor& out_grad_backprop = context->input(2);
+
+ // For maxpooling, tensor_in should have 4 dimensions.
+    OP_REQUIRES(context, tensor_in.dims() == 4,
+                errors::InvalidArgument("tensor_in must be 4-dimensional"));
+ OP_REQUIRES(context, tensor_out.dims() == 4,
+ errors::InvalidArgument("tensor_out must be 4-dimensional"));
+ // For maxpooling, out_grad_backprop should have 4 dimensions.
+ OP_REQUIRES(
+ context, out_grad_backprop.dims() == 4,
+ errors::InvalidArgument("out_grad_backprop must be 4-dimensional"));
+
+ Tensor* output = nullptr;
+ OP_REQUIRES_OK(context, context->forward_input_or_allocate_output(
+ {2}, 0, tensor_out.shape(), &output));
+
+ PoolParameters params{context, ksize_, stride_,
+ padding_, data_format_, tensor_in.shape()};
+
+ functor::MaxPoolGradBackwardNoMask<T>()(
+ data_format_, tensor_in.flat<T>().data(), tensor_out.flat<T>().data(),
+ params.tensor_in_batch, params.out_height, params.out_width,
+ params.depth, params.tensor_in_rows, params.tensor_in_cols,
+ params.window_rows, params.window_cols, params.row_stride,
+ params.col_stride, params.pad_rows, params.pad_cols,
+ out_grad_backprop.flat<T>().data(), output->flat<T>().data(),
+ context->eigen_device<Eigen::GpuDevice>());
+ }
+
+ private:
+ std::vector<int32> ksize_;
+ std::vector<int32> stride_;
+ Padding padding_;
+ TensorFormat data_format_;
+};
#endif // GOOGLE_CUDA
@@ -565,6 +766,56 @@ class MaxPoolingGradWithArgmaxOp : public OpKernel {
Padding padding_;
};
+template <typename Device, typename T>
+struct LaunchMaxPoolingGradGradWithArgmax;
+
+template <typename Device, typename T>
+class MaxPoolingGradGradWithArgmaxOp : public OpKernel {
+ public:
+ explicit MaxPoolingGradGradWithArgmaxOp(OpKernelConstruction* context)
+ : OpKernel(context) {
+ OP_REQUIRES_OK(context, context->GetAttr("ksize", &ksize_));
+ OP_REQUIRES(context, ksize_.size() == 4,
+ errors::InvalidArgument("Sliding window ksize field must "
+ "specify 4 dimensions"));
+ OP_REQUIRES_OK(context, context->GetAttr("strides", &stride_));
+ OP_REQUIRES(context, stride_.size() == 4,
+ errors::InvalidArgument("Sliding window stride field must "
+ "specify 4 dimensions"));
+ OP_REQUIRES_OK(context, context->GetAttr("padding", &padding_));
+ OP_REQUIRES(context, ksize_[0] == 1 && stride_[0] == 1,
+ errors::Unimplemented(
+ "Pooling is not yet supported on the batch dimension."));
+ }
+
+ void Compute(OpKernelContext* context) override {
+ const Tensor& tensor_in = context->input(0);
+ const Tensor& grad_in = context->input(1);
+ const Tensor& argmax = context->input(2);
+
+ PoolParameters params{context, ksize_, stride_,
+ padding_, FORMAT_NHWC, tensor_in.shape()};
+ if (!context->status().ok()) {
+ return;
+ }
+
+ TensorShape out_shape({params.tensor_in_batch, params.out_height,
+ params.out_width, params.depth});
+
+ Tensor* grad_out = nullptr;
+ OP_REQUIRES_OK(context, context->forward_input_or_allocate_output(
+ {1}, 0, out_shape, &grad_out));
+
+ LaunchMaxPoolingGradGradWithArgmax<Device, T>::launch(
+ context, params, grad_in, argmax, grad_out);
+ }
+
+ private:
+ std::vector<int32> ksize_;
+ std::vector<int32> stride_;
+ Padding padding_;
+};
+
#if GOOGLE_CUDA
template <typename T>
class MaxPoolingNoMaskOp<GPUDevice, T> : public OpKernel {
@@ -631,7 +882,7 @@ template <typename T>
struct LaunchMaxPoolingNoMask<Eigen::GpuDevice, T> {
static void launch(OpKernelContext* context, const PoolParameters& params,
const Tensor& input, Tensor* output) {
- bool status = MaxPoolForwardWithOptionalArgmax(
+ bool status = functor::MaxPoolForwardWithOptionalArgmax<T>()(
input.flat<T>().data(), params.tensor_in_batch, params.tensor_in_rows,
params.tensor_in_cols, params.depth, params.out_height,
params.out_width, params.window_rows, params.window_cols,
@@ -644,18 +895,11 @@ struct LaunchMaxPoolingNoMask<Eigen::GpuDevice, T> {
}
};
-REGISTER_KERNEL_BUILDER(
- Name("MaxPool").Device(DEVICE_GPU).TypeConstraint<float>("T"),
- MaxPoolingNoMaskOp<Eigen::GpuDevice, float>);
-REGISTER_KERNEL_BUILDER(
- Name("MaxPool").Device(DEVICE_GPU).TypeConstraint<Eigen::half>("T"),
- MaxPoolingNoMaskOp<Eigen::GpuDevice, Eigen::half>);
-
template <typename T>
struct LaunchMaxPoolingWithArgmax<Eigen::GpuDevice, T> {
static void launch(OpKernelContext* context, const PoolParameters& params,
const Tensor& input, Tensor* output, Tensor* argmax) {
- bool status = MaxPoolForwardWithOptionalArgmax(
+ bool status = functor::MaxPoolForwardWithOptionalArgmax<T>()(
input.flat<T>().data(), params.tensor_in_batch, params.tensor_in_rows,
params.tensor_in_cols, params.depth, params.out_height,
params.out_width, params.window_rows, params.window_cols,
@@ -670,17 +914,6 @@ struct LaunchMaxPoolingWithArgmax<Eigen::GpuDevice, T> {
}
};
-REGISTER_KERNEL_BUILDER(Name("MaxPoolWithArgmax")
- .Device(DEVICE_GPU)
- .TypeConstraint<int64>("Targmax")
- .TypeConstraint<float>("T"),
- MaxPoolingWithArgmaxOp<Eigen::GpuDevice, float>);
-REGISTER_KERNEL_BUILDER(Name("MaxPoolWithArgmax")
- .Device(DEVICE_GPU)
- .TypeConstraint<int64>("Targmax")
- .TypeConstraint<Eigen::half>("T"),
- MaxPoolingWithArgmaxOp<Eigen::GpuDevice, Eigen::half>);
-
template <typename T>
struct LaunchMaxPoolingGradWithArgmax<Eigen::GpuDevice, T> {
static void launch(OpKernelContext* context, const PoolParameters& params,
@@ -693,30 +926,118 @@ struct LaunchMaxPoolingGradWithArgmax<Eigen::GpuDevice, T> {
const int top_offset = params.out_height * params.out_width * params.depth;
const int bottom_offset =
params.tensor_in_rows * params.tensor_in_cols * params.depth;
- bool status = MaxPoolBackwardWithArgmax(
+ bool status = functor::MaxPoolBackwardWithArgmax<T>()(
output_size, input_size, grad_in.flat<T>().data(),
reinterpret_cast<const int64*>(argmax.flat<int64>().data()), top_offset,
bottom_offset, grad_out->flat<T>().data(), context->eigen_gpu_device());
if (!status) {
context->SetStatus(
- errors::Internal("Failed launching MaxPoolForwardWithArgmax"));
+ errors::Internal("Failed launching MaxPoolBackwardWithArgmax"));
}
}
};
-REGISTER_KERNEL_BUILDER(
- Name("MaxPoolGradWithArgmax")
- .Device(DEVICE_GPU)
- .TypeConstraint<float>("T")
- .TypeConstraint<int64>("Targmax"),
- MaxPoolingGradWithArgmaxOp<Eigen::GpuDevice, float>);
-REGISTER_KERNEL_BUILDER(
- Name("MaxPoolGradWithArgmax")
- .Device(DEVICE_GPU)
- .TypeConstraint<Eigen::half>("T")
- .TypeConstraint<int64>("Targmax"),
- MaxPoolingGradWithArgmaxOp<Eigen::GpuDevice, Eigen::half>);
+template <typename T>
+struct LaunchMaxPoolingGradGradWithArgmax<Eigen::GpuDevice, T> {
+ static void launch(OpKernelContext* context, const PoolParameters& params,
+ const Tensor& grad_in, const Tensor& argmax,
+ Tensor* grad_out) {
+ const int input_size = params.tensor_in_batch * params.tensor_in_rows *
+ params.tensor_in_cols * params.depth;
+ const int output_size = params.tensor_in_batch * params.out_height *
+ params.out_width * params.depth;
+ const int top_offset =
+ params.tensor_in_rows * params.tensor_in_cols * params.depth;
+ const int bottom_offset =
+ params.out_width * params.out_height * params.depth;
+ bool status = functor::MaxPoolGradBackwardWithArgmax<T>()(
+ output_size, input_size, grad_in.flat<T>().data(),
+ reinterpret_cast<const int64*>(argmax.flat<int64>().data()), top_offset,
+ bottom_offset, grad_out->flat<T>().data(), context->eigen_gpu_device());
+ if (!status) {
+ context->SetStatus(
+ errors::Internal("Failed launching MaxPoolGradBackwardWithArgmax"));
+ }
+ }
+};
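Note how the per-image offsets swap roles between the first- and second-order launchers: MaxPoolGradWithArgmax walks an output-sized incoming gradient with top_offset = Hout*Wout*C, while the grad-grad launcher above uses top_offset = H*W*C. A worked example with hypothetical sizes:

// Hypothetical shapes: N=1, H=W=4, C=1, 2x2 pool with stride 2 (Hout=Wout=2).
const int in_hwc = 4 * 4 * 1;   // 16 elements per image on the input side
const int out_hwc = 2 * 2 * 1;  //  4 elements per image on the output side
// MaxPoolGradWithArgmax:     top_offset = out_hwc (4),  bottom_offset = in_hwc (16)
// MaxPoolGradGradWithArgmax: top_offset = in_hwc (16),  bottom_offset = out_hwc (4)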
#endif // GOOGLE_CUDA
+#define REGISTER_MAX_POOL_KERNELS(D, T) \
+ REGISTER_KERNEL_BUILDER( \
+ Name("MaxPoolGrad").Device(DEVICE_##D).TypeConstraint<T>("T"), \
+ MaxPoolingGradOp<D##Device, T>); \
+ REGISTER_KERNEL_BUILDER( \
+ Name("MaxPoolGradGrad").Device(DEVICE_##D).TypeConstraint<T>("T"), \
+ MaxPoolingGradGradOp<D##Device, T>);
+
+// The kernels below are implemented only for the CPU device.
+#define REGISTER_CPU_ONLY_POOL_KERNELS(T) \
+ REGISTER_KERNEL_BUILDER( \
+ Name("MaxPool").Device(DEVICE_CPU).TypeConstraint<T>("T"), \
+ MaxPoolingOp<CPUDevice, T>);
+TF_CALL_REAL_NUMBER_TYPES(REGISTER_CPU_ONLY_POOL_KERNELS);
+#undef REGISTER_CPU_ONLY_POOL_KERNELS
+
+#define REGISTER_CPU_MAX_POOL_KERNELS(T) REGISTER_MAX_POOL_KERNELS(CPU, T);
+TF_CALL_REAL_NUMBER_TYPES(REGISTER_CPU_MAX_POOL_KERNELS);
+#undef REGISTER_CPU_MAX_POOL_KERNELS
+
+#if GOOGLE_CUDA
+
+// Forward declarations for the functor specializations for GPU.
+namespace functor {
+#define DECLARE_GPU_SPEC(T) \
+ template <> \
+ void SpatialMaxPooling<Eigen::GpuDevice, T>::operator()( \
+ const Eigen::GpuDevice& d, typename TTypes<T, 4>::Tensor output, \
+ typename TTypes<T, 4>::ConstTensor input, int window_rows, \
+ int window_cols, int row_stride, int col_stride, \
+ const Eigen::PaddingType& padding); \
+ extern template struct SpatialMaxPooling<Eigen::GpuDevice, T>;
+
+TF_CALL_GPU_NUMBER_TYPES(DECLARE_GPU_SPEC);
+#undef DECLARE_GPU_SPEC
+} // namespace functor
+
+#define REGISTER_GPU_MAX_POOL_KERNELS(T) REGISTER_MAX_POOL_KERNELS(GPU, T)
+TF_CALL_GPU_NUMBER_TYPES(REGISTER_GPU_MAX_POOL_KERNELS);
+#undef REGISTER_GPU_MAX_POOL_KERNELS
+
+// The kernels below are currently implemented only for the GPU device.
+// Note(jiayq): Currently, the Caffe custom implementation is faster than the
+// default Eigen implementation, so we are using the custom kernel as the
+// default. However, you can explicitly invoke the Eigen version using
+// kernel_label_map.
+#define REGISTER_GPU_ONLY_POOL_KERNELS(T) \
+ REGISTER_KERNEL_BUILDER(Name("MaxPool") \
+ .Device(DEVICE_GPU) \
+ .TypeConstraint<T>("T") \
+ .Label("eigen_tensor"), \
+ MaxPoolingOp<GPUDevice, T>); \
+ REGISTER_KERNEL_BUILDER( \
+ Name("MaxPool").Device(DEVICE_GPU).TypeConstraint<T>("T"), \
+ MaxPoolingNoMaskOp<GPUDevice, T>); \
+ REGISTER_KERNEL_BUILDER(Name("MaxPoolWithArgmax") \
+ .Device(DEVICE_GPU) \
+ .TypeConstraint<int64>("Targmax") \
+ .TypeConstraint<T>("T"), \
+ MaxPoolingWithArgmaxOp<GPUDevice, T>); \
+ REGISTER_KERNEL_BUILDER(Name("MaxPoolGradWithArgmax") \
+ .Device(DEVICE_GPU) \
+ .TypeConstraint<T>("T") \
+ .TypeConstraint<int64>("Targmax"), \
+ MaxPoolingGradWithArgmaxOp<GPUDevice, T>); \
+ REGISTER_KERNEL_BUILDER(Name("MaxPoolGradGradWithArgmax") \
+ .Device(DEVICE_GPU) \
+ .TypeConstraint<T>("T") \
+ .TypeConstraint<int64>("Targmax"), \
+ MaxPoolingGradGradWithArgmaxOp<GPUDevice, T>);
+TF_CALL_GPU_NUMBER_TYPES(REGISTER_GPU_ONLY_POOL_KERNELS);
+#undef REGISTER_GPU_ONLY_POOL_KERNELS
+
+#endif // GOOGLE_CUDA
+
+#undef REGISTER_MAX_POOL_KERNELS
+
} // namespace tensorflow
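For clarity on the token pasting in REGISTER_MAX_POOL_KERNELS, DEVICE_##D and D##Device expand so that a single REGISTER_MAX_POOL_KERNELS(CPU, float) produces:

// Approximate expansion of REGISTER_MAX_POOL_KERNELS(CPU, float):
REGISTER_KERNEL_BUILDER(
    Name("MaxPoolGrad").Device(DEVICE_CPU).TypeConstraint<float>("T"),
    MaxPoolingGradOp<CPUDevice, float>);
REGISTER_KERNEL_BUILDER(
    Name("MaxPoolGradGrad").Device(DEVICE_CPU).TypeConstraint<float>("T"),
    MaxPoolingGradGradOp<CPUDevice, float>);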
diff --git a/tensorflow/core/kernels/maxpooling_op_gpu.cu.cc b/tensorflow/core/kernels/maxpooling_op_gpu.cu.cc
index 91b50b1e11..0c638ca233 100644
--- a/tensorflow/core/kernels/maxpooling_op_gpu.cu.cc
+++ b/tensorflow/core/kernels/maxpooling_op_gpu.cu.cc
@@ -199,32 +199,145 @@ __global__ void MaxPoolBackward(const int nthreads, const dtype* top_diff,
}
}
-#undef CUDA_1D_KERNEL_LOOP
-} // namespace
+// The parameters to the kernels in the gradient-of-gradient function are as
+// follows:
+// nthreads: the number of threads, which is equal to the output size. The
+// gradient of the MaxPooling gradient w.r.t. the output data has
+// dimensions of N*C*Hout*Wout.
+// bottom_data: the bottom data of N*H*W*C (or N*C*H*W) items.
+// output_data: the output data of N*Hout*Wout*C (or N*C*Hout*Wout) items.
+// height, width, pooled_height, pooled_width: the input and output sizes.
+// kernel_h, kernel_w: the kernel sizes.
+// stride_h, stride_w: the strides.
+// pad_t, pad_l: the padding values on the top and left side.
+// top_diff: the gradient of the gradient of the output data w.r.t. the
+// input data, of size N*H*W*C (or N*C*H*W).
+// bottom_diff: the gradient of the gradient w.r.t. output.
+template <typename dtype>
+__global__ void MaxPoolGradBackwardNoMaskNCHW(
+ const int nthreads, const dtype* bottom_data, const dtype* output_data,
+ const int pooled_height, const int pooled_width, const int channels,
+ const int height, const int width, const int kernel_h, const int kernel_w,
+ const int stride_h, const int stride_w, const int pad_t, const int pad_l,
+ const dtype* top_diff, dtype* bottom_diff) {
+ CUDA_1D_KERNEL_LOOP(index, nthreads) {
+ // First find out the index to the maximum, since we have no mask.
+ int pw = index % pooled_width;
+ int ph = (index / pooled_width) % pooled_height;
+ int c = (index / pooled_width / pooled_height) % channels;
+ int n = index / pooled_width / pooled_height / channels;
+ int hstart = ph * stride_h - pad_t;
+ int wstart = pw * stride_w - pad_l;
+ const int hend = min(hstart + kernel_h, height);
+ const int wend = min(wstart + kernel_w, width);
+ hstart = max(hstart, 0);
+ wstart = max(wstart, 0);
+ bool should_stop = false;
+ int maxidx = -1;
+ const dtype* bottom_data_n = bottom_data + n * channels * height * width;
+ // Propagate only first value from top_diff corresponding to the maximum.
+ for (int h = hstart; h < hend && !should_stop; ++h) {
+ for (int w = wstart; w < wend && !should_stop; ++w) {
+ int idx = c * height * width + h * width + w;
+ if (output_data[index] == bottom_data_n[idx]) {
+ maxidx = idx;
+ should_stop = true;
+ }
+ }
+ }
+ // Set the bottom diff (no atomic needed). maxidx may remain -1 if all the
+ // bottom_data values are NaN, in which case this element is left unset.
+ if (maxidx != -1) {
+ bottom_diff[index] = top_diff[n * channels * height * width + maxidx];
+ }
+ }
+}
-bool MaxPoolForwardWithOptionalArgmax(
- const float* bottom_data, const int batch, const int height,
- const int width, const int channels, const int pooled_height,
- const int pooled_width, const int kernel_h, const int kernel_w,
+template <typename dtype>
+__global__ void MaxPoolGradBackwardNoMaskNHWC(
+ const int nthreads, const dtype* bottom_data, const dtype* output_data,
+ const int pooled_height, const int pooled_width, const int channels,
+ const int height, const int width, const int kernel_h, const int kernel_w,
const int stride_h, const int stride_w, const int pad_t, const int pad_l,
- float* top_data, int64* mask, const Eigen::GpuDevice& d) {
- const int kThreadsPerBlock = 1024;
- const int output_size = batch * channels * pooled_height * pooled_width;
+ const dtype* top_diff, dtype* bottom_diff) {
+ CUDA_1D_KERNEL_LOOP(index, nthreads) {
+ // First find out the index to the maximum, since we have no mask.
+ int n = index;
+ int c = n % channels;
+ n /= channels;
+ int wstart = (n % pooled_width) * stride_w - pad_l;
+ n /= pooled_width;
+ int hstart = (n % pooled_height) * stride_h - pad_t;
+ n /= pooled_height;
+ int hend = min(hstart + kernel_h, height);
+ int wend = min(wstart + kernel_w, width);
+ hstart = max(hstart, 0);
+ wstart = max(wstart, 0);
+ bool should_stop = false;
+ int maxidx = -1;
+ const dtype* bottom_data_n = bottom_data + n * height * width * channels;
+ // Propagate only first value from top_diff corresponding to the maximum.
+ for (int h = hstart; h < hend && !should_stop; ++h) {
+ for (int w = wstart; w < wend && !should_stop; ++w) {
+ int idx = (h * width + w) * channels + c;
+ if (output_data[index] == bottom_data_n[idx]) {
+ maxidx = idx;
+ should_stop = true;
+ }
+ }
+ }
+ // Set the bottom diff (no atomic needed). maxidx may remain -1 if all the
+ // bottom_data values are NaN, in which case this element is left unset.
+ if (maxidx != -1) {
+ bottom_diff[index] = top_diff[n * height * width * channels + maxidx];
+ }
+ }
+}
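The NHWC kernel recovers (n, ph, pw, c) from a flat output index by peeling off the fastest-varying dimension first. A small sketch of the same decomposition with hypothetical sizes:

// Decompose a flat NHWC index (c fastest, then pw, ph, n), mirroring the
// kernel above. Hypothetical sizes: pooled_height=2, pooled_width=3, channels=4.
int n = 29;                         // == ((1 * 2 + 0) * 3 + 1) * 4 + 1
const int c = n % 4;   n /= 4;      // c  = 1
const int pw = n % 3;  n /= 3;      // pw = 1
const int ph = n % 2;  n /= 2;      // ph = 0
                                    // n  = 1 (batch index)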
- MaxPoolForwardNHWC<<<(output_size + kThreadsPerBlock - 1) / kThreadsPerBlock,
- kThreadsPerBlock, 0, d.stream()>>>(
- output_size, bottom_data, height, width, channels, pooled_height,
- pooled_width, kernel_h, kernel_w, stride_h, stride_w, pad_t, pad_l,
- top_data, mask);
- return d.ok();
+// The parameters to the kernels in the gradient-of-gradient function are as
+// follows:
+// nthreads: the number of threads, which is equal to the output size. The
+// gradient of the MaxPooling gradient w.r.t. the output data has
+// dimensions of N*C*Hout*Wout.
+// top_diff: the gradient of the gradient of the output data w.r.t. the
+// input data, of size N*H*W*C (or N*C*H*W). As we have stored the
+// flattened index of the input entries, the backward function is
+// agnostic of the input storage order.
+// mask: the output mask of the same size as top_data. It is stored in
+// int64 form, keeping track of the flattened index of the input item that
+// produces the max output.
+// top_offset: the pre-computed per-image offset of the maxpool input
+// gradient. This is equal to H*W*C. We choose to pre-compute this so we
+// do not need to compute it every time inside the kernel.
+// bottom_offset: the pre-computed per-image offset of the maxpool output.
+// This is equal to Hout*Wout*C.
+// bottom_diff: the gradient of the gradient w.r.t. output.
+// Unlike MaxPoolBackward above, this kernel writes each bottom_diff element
+// exactly once (a pure gather), so it needs neither CudaAtomicAdd nor a
+// prior zero-fill of bottom_diff.
+template <typename dtype>
+__global__ void MaxPoolGradBackward(const int nthreads, const dtype* top_diff,
+ const int64* mask, const int top_offset,
+ const int bottom_offset,
+ dtype* bottom_diff) {
+ CUDA_1D_KERNEL_LOOP(index, nthreads) {
+ int image_id = (index / bottom_offset);
+ bottom_diff[index] = top_diff[image_id * top_offset + mask[index]];
+ }
}
-bool MaxPoolForwardWithOptionalArgmax(
- const Eigen::half* bottom_data, const int batch, const int height,
- const int width, const int channels, const int pooled_height,
- const int pooled_width, const int kernel_h, const int kernel_w,
- const int stride_h, const int stride_w, const int pad_t, const int pad_l,
- Eigen::half* top_data, int64* mask, const Eigen::GpuDevice& d) {
+#undef CUDA_1D_KERNEL_LOOP
+} // namespace
+
+namespace functor {
+
+template <typename T>
+bool MaxPoolForwardWithOptionalArgmax<T>::operator()(
+ const T* bottom_data, const int batch, const int height, const int width,
+ const int channels, const int pooled_height, const int pooled_width,
+ const int kernel_h, const int kernel_w, const int stride_h,
+ const int stride_w, const int pad_t, const int pad_l, T* top_data,
+ int64* mask, const Eigen::GpuDevice& d) {
const int kThreadsPerBlock = 1024;
const int output_size = batch * channels * pooled_height * pooled_width;
@@ -236,14 +349,13 @@ bool MaxPoolForwardWithOptionalArgmax(
return d.ok();
}
-bool MaxPoolBackwardNoMask(const float* bottom_data, const int batch,
- const int height, const int width,
- const int channels, const int pooled_height,
- const int pooled_width, const int kernel_h,
- const int kernel_w, const int stride_h,
- const int stride_w, const int pad_t, const int pad_l,
- const float* top_diff, float* bottom_diff,
- const Eigen::GpuDevice& d) {
+template <typename T>
+bool MaxPoolBackwardNoMask<T>::operator()(
+ const T* bottom_data, const int batch, const int height, const int width,
+ const int channels, const int pooled_height, const int pooled_width,
+ const int kernel_h, const int kernel_w, const int stride_h,
+ const int stride_w, const int pad_t, const int pad_l, const T* top_diff,
+ T* bottom_diff, const Eigen::GpuDevice& d) {
const int kThreadsPerBlock = 1024;
const int bottom_size = batch * channels * height * width;
const int top_size = batch * channels * pooled_height * pooled_width;
@@ -260,34 +372,11 @@ bool MaxPoolBackwardNoMask(const float* bottom_data, const int batch,
return d.ok();
}
-bool MaxPoolBackwardNoMask(const Eigen::half* bottom_data, const int batch,
- const int height, const int width,
- const int channels, const int pooled_height,
- const int pooled_width, const int kernel_h,
- const int kernel_w, const int stride_h,
- const int stride_w, const int pad_t, const int pad_l,
- const Eigen::half* top_diff, Eigen::half* bottom_diff,
- const Eigen::GpuDevice& d) {
- const int kThreadsPerBlock = 1024;
- const int bottom_size = batch * channels * height * width;
- const int top_size = batch * channels * pooled_height * pooled_width;
-
- SetZero<<<(bottom_size + kThreadsPerBlock - 1) / kThreadsPerBlock,
- kThreadsPerBlock, 0, d.stream()>>>(bottom_size, bottom_diff);
-
- MaxPoolBackwardNoMaskNHWC<<<(top_size + kThreadsPerBlock - 1) /
- kThreadsPerBlock,
- kThreadsPerBlock, 0, d.stream()>>>(
- top_size, bottom_data, height, width, channels, pooled_height,
- pooled_width, kernel_h, kernel_w, stride_h, stride_w, pad_t, pad_l,
- top_diff, bottom_diff);
- return d.ok();
-}
-
-bool MaxPoolBackwardWithArgmax(const int output_size, const int input_size,
- const float* top_diff, const int64* mask,
- const int top_offset, const int bottom_offset,
- float* bottom_diff, const Eigen::GpuDevice& d) {
+template <typename T>
+bool MaxPoolBackwardWithArgmax<T>::operator()(
+ const int output_size, const int input_size, const T* top_diff,
+ const int64* mask, const int top_offset, const int bottom_offset,
+ T* bottom_diff, const Eigen::GpuDevice& d) {
const int kThreadsPerBlock = 1024;
SetZero<<<(input_size + kThreadsPerBlock - 1) / kThreadsPerBlock,
kThreadsPerBlock, 0, d.stream()>>>(input_size, bottom_diff);
@@ -297,30 +386,61 @@ bool MaxPoolBackwardWithArgmax(const int output_size, const int input_size,
return d.ok();
}
-bool MaxPoolBackwardWithArgmax(const int output_size, const int input_size,
- const Eigen::half* top_diff, const int64* mask,
- const int top_offset, const int bottom_offset,
- Eigen::half* bottom_diff,
- const Eigen::GpuDevice& d) {
- const int kThreadsPerBlock = 1024;
- SetZero<<<(input_size + kThreadsPerBlock - 1) / kThreadsPerBlock,
- kThreadsPerBlock, 0, d.stream()>>>(input_size, bottom_diff);
- MaxPoolBackward<<<(output_size + kThreadsPerBlock - 1) / kThreadsPerBlock,
- kThreadsPerBlock, 0, d.stream()>>>(
- output_size, top_diff, mask, top_offset, bottom_offset, bottom_diff);
+template <typename T>
+bool MaxPoolGradBackwardNoMask<T>::operator()(
+ TensorFormat data_format, const T* bottom_data, const T* output_data,
+ const int batch, const int pooled_height, const int pooled_width,
+ const int channels, const int height, const int width, const int kernel_h,
+ const int kernel_w, const int stride_h, const int stride_w, const int pad_t,
+ const int pad_l, const T* top_diff, T* bottom_diff,
+ const Eigen::GpuDevice& d) {
+ const int num_kernels = batch * channels * pooled_height * pooled_width;
+ CudaLaunchConfig config = GetCudaLaunchConfig(num_kernels, d);
+
+ if (data_format == FORMAT_NHWC) {
+ MaxPoolGradBackwardNoMaskNHWC<<<config.block_count, config.thread_per_block,
+ 0, d.stream()>>>(
+ num_kernels, bottom_data, output_data, pooled_height, pooled_width,
+ channels, height, width, kernel_h, kernel_w, stride_h, stride_w, pad_t,
+ pad_l, top_diff, bottom_diff);
+ } else {
+ MaxPoolGradBackwardNoMaskNCHW<<<config.block_count, config.thread_per_block,
+ 0, d.stream()>>>(
+ num_kernels, bottom_data, output_data, pooled_height, pooled_width,
+ channels, height, width, kernel_h, kernel_w, stride_h, stride_w, pad_t,
+ pad_l, top_diff, bottom_diff);
+ }
+ return d.ok();
+}
+
+template <typename T>
+bool MaxPoolGradBackwardWithArgmax<T>::operator()(
+ const int output_size, const int input_size, const T* top_diff,
+ const int64* mask, const int top_offset, const int bottom_offset,
+ T* bottom_diff, const Eigen::GpuDevice& d) {
+ CudaLaunchConfig config = GetCudaLaunchConfig(output_size, d);
+ MaxPoolGradBackward<<<config.block_count, config.thread_per_block, 0,
+ d.stream()>>>(output_size, top_diff, mask, top_offset,
+ bottom_offset, bottom_diff);
return d.ok();
}
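GetCudaLaunchConfig sizes the grid from the element count and the device, replacing the fixed 1024-thread blocks the older launchers computed by hand; the underlying round-up arithmetic (with output_size standing in for the element count) is the usual:

// Sketch of the manual grid sizing used elsewhere in this file: round the
// element count up to whole blocks of kThreadsPerBlock threads.
const int kThreadsPerBlock = 1024;
const int num_blocks = (output_size + kThreadsPerBlock - 1) / kThreadsPerBlock;
// e.g. output_size = 5000 -> num_blocks = 5 (5120 threads); CUDA_1D_KERNEL_LOOP
// guards each thread, so the 120 excess threads simply do no work.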
typedef Eigen::GpuDevice GPUDevice;
-#define DEFINE_GPU_KERNELS(T) \
- template struct functor::SpatialMaxPooling<GPUDevice, T>;
+#define DEFINE_GPU_KERNELS(T) \
+ template struct SpatialMaxPooling<GPUDevice, T>; \
+ template struct MaxPoolForwardWithOptionalArgmax<T>; \
+ template struct MaxPoolBackwardWithArgmax<T>; \
+ template struct MaxPoolBackwardNoMask<T>; \
+ template struct MaxPoolGradBackwardWithArgmax<T>; \
+ template struct MaxPoolGradBackwardNoMask<T>;
-DEFINE_GPU_KERNELS(float)
-DEFINE_GPU_KERNELS(Eigen::half)
+TF_CALL_GPU_NUMBER_TYPES(DEFINE_GPU_KERNELS);
#undef DEFINE_GPU_KERNELS
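TF_CALL_GPU_NUMBER_TYPES(DEFINE_GPU_KERNELS) replaces the hand-written float and Eigen::half lines deleted above with one explicit instantiation per supported GPU floating-point type; conceptually the expansion looks like:

// Approximate expansion (one block per GPU number type; float and Eigen::half
// are the types this file previously instantiated by hand):
template struct SpatialMaxPooling<GPUDevice, float>;
template struct MaxPoolForwardWithOptionalArgmax<float>;
// ... remaining functors, then the same list again for Eigen::half.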
+} // namespace functor
+
} // end namespace tensorflow
#endif // GOOGLE_CUDA
diff --git a/tensorflow/core/kernels/maxpooling_op_gpu.h b/tensorflow/core/kernels/maxpooling_op_gpu.h
index d1c73a372e..d2029f5719 100644
--- a/tensorflow/core/kernels/maxpooling_op_gpu.h
+++ b/tensorflow/core/kernels/maxpooling_op_gpu.h
@@ -24,54 +24,62 @@ limitations under the License.
#include "tensorflow/core/framework/tensor_types.h"
#include "tensorflow/core/platform/types.h"
+#include "tensorflow/core/util/tensor_format.h"
namespace tensorflow {
+namespace functor {
// Run the forward pass of max pooling, optionally writing the argmax indices to
// the mask array, if it is not nullptr. If mask is passed in as nullptr, the
// argmax indices are not written.
-bool MaxPoolForwardWithOptionalArgmax(
- const float* bottom_data, const int batch, const int height,
- const int width, const int channels, const int pooled_height,
- const int pooled_width, const int kernel_h, const int kernel_w,
- const int stride_h, const int stride_w, const int pad_t, const int pad_l,
- float* top_data, int64* mask, const Eigen::GpuDevice& d);
-
-bool MaxPoolForwardWithOptionalArgmax(
- const Eigen::half* bottom_data, const int batch, const int height,
- const int width, const int channels, const int pooled_height,
- const int pooled_width, const int kernel_h, const int kernel_w,
- const int stride_h, const int stride_w, const int pad_t, const int pad_l,
- Eigen::half* top_data, int64* mask, const Eigen::GpuDevice& d);
-
-bool MaxPoolBackwardWithArgmax(const int output_size, const int input_size,
- const float* top_diff, const int64* mask,
- const int top_offset, const int bottom_offset,
- float* bottom_diff, const Eigen::GpuDevice& d);
-
-bool MaxPoolBackwardWithArgmax(const int output_size, const int input_size,
- const Eigen::half* top_diff, const int64* mask,
- const int top_offset, const int bottom_offset,
- Eigen::half* bottom_diff,
- const Eigen::GpuDevice& d);
-
-bool MaxPoolBackwardNoMask(const float* bottom_data, const int batch,
- const int height, const int width,
- const int channels, const int pooled_height,
- const int pooled_width, const int kernel_h,
- const int kernel_w, const int stride_h,
- const int stride_w, const int pad_t, const int pad_l,
- const float* top_diff, float* bottom_diff,
- const Eigen::GpuDevice& d);
-
-bool MaxPoolBackwardNoMask(const Eigen::half* bottom_data, const int batch,
- const int height, const int width,
- const int channels, const int pooled_height,
- const int pooled_width, const int kernel_h,
- const int kernel_w, const int stride_h,
- const int stride_w, const int pad_t, const int pad_l,
- const Eigen::half* top_diff, Eigen::half* bottom_diff,
- const Eigen::GpuDevice& d);
+template <typename T>
+struct MaxPoolForwardWithOptionalArgmax {
+ bool operator()(const T* bottom_data, const int batch, const int height,
+ const int width, const int channels, const int pooled_height,
+ const int pooled_width, const int kernel_h,
+ const int kernel_w, const int stride_h, const int stride_w,
+ const int pad_t, const int pad_l, T* top_data, int64* mask,
+ const Eigen::GpuDevice& d);
+};
+
+template <typename T>
+struct MaxPoolBackwardWithArgmax {
+ bool operator()(const int output_size, const int input_size,
+ const T* top_diff, const int64* mask, const int top_offset,
+ const int bottom_offset, T* bottom_diff,
+ const Eigen::GpuDevice& d);
+};
+
+template <typename T>
+struct MaxPoolBackwardNoMask {
+ bool operator()(const T* bottom_data, const int batch, const int height,
+ const int width, const int channels, const int pooled_height,
+ const int pooled_width, const int kernel_h,
+ const int kernel_w, const int stride_h, const int stride_w,
+ const int pad_t, const int pad_l, const T* top_diff,
+ T* bottom_diff, const Eigen::GpuDevice& d);
+};
+
+template <typename T>
+struct MaxPoolGradBackwardWithArgmax {
+ bool operator()(const int output_size, const int input_size,
+ const T* top_diff, const int64* mask, const int top_offset,
+ const int bottom_offset, T* bottom_diff,
+ const Eigen::GpuDevice& d);
+};
+
+template <typename T>
+struct MaxPoolGradBackwardNoMask {
+ bool operator()(TensorFormat data_format, const T* bottom_data,
+ const T* output_data, const int batch,
+ const int pooled_height, const int pooled_width,
+ const int channels, const int height, const int width,
+ const int kernel_h, const int kernel_w, const int stride_h,
+ const int stride_w, const int pad_t, const int pad_l,
+ const T* top_diff, T* bottom_diff, const Eigen::GpuDevice& d);
+};
+
+} // namespace functor
} // namespace tensorflow
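With the overloaded free functions replaced by templated functor structs, a call site constructs a functor of the right element type and invokes operator(). A minimal sketch of a caller, using the declarations above (variables abbreviated; a null mask skips the argmax output, per the comment above):

// Sketch: invoking the forward functor from a launcher-like context.
bool ok = functor::MaxPoolForwardWithOptionalArgmax<float>()(
    bottom_data, batch, height, width, channels, pooled_height, pooled_width,
    kernel_h, kernel_w, stride_h, stride_w, pad_t, pad_l, top_data,
    /*mask=*/nullptr, device);
if (!ok) {
  // Callers surface this as errors::Internal(...) on the OpKernelContext.
}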
diff --git a/tensorflow/core/kernels/mkl_avgpooling_op.cc b/tensorflow/core/kernels/mkl_avgpooling_op.cc
new file mode 100644
index 0000000000..71918fe269
--- /dev/null
+++ b/tensorflow/core/kernels/mkl_avgpooling_op.cc
@@ -0,0 +1,428 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+ Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+ ==============================================================================*/
+
+#ifdef INTEL_MKL
+#define EIGEN_USE_THREADS
+
+#include "tensorflow/core/common_runtime/device.h"
+#include "tensorflow/core/framework/common_shape_fns.h"
+#include "tensorflow/core/framework/numeric_op.h"
+#include "tensorflow/core/framework/register_types.h"
+#include "tensorflow/core/util/mkl_util.h"
+
+#include "tensorflow/core/kernels/mkl_pooling_ops_common.h"
+
+namespace tensorflow {
+
+typedef Eigen::ThreadPoolDevice CPUDevice;
+
+template <typename Device, typename T>
+class MklAvgPoolingOp : public UnaryOp<T> {
+ public:
+ explicit MklAvgPoolingOp(OpKernelConstruction* context)
+ : UnaryOp<T>(context) {
+ string data_format;
+ OP_REQUIRES_OK(context, context->GetAttr("data_format", &data_format));
+ OP_REQUIRES(context, FormatFromString(data_format, &data_format_),
+ errors::InvalidArgument("Invalid data format"));
+
+ OP_REQUIRES_OK(context, context->GetAttr("ksize", &ksize_));
+ OP_REQUIRES(context, ksize_.size() == 4,
+ errors::InvalidArgument("Sliding window ksize field must "
+ "specify 4 dimensions"));
+ OP_REQUIRES_OK(context, context->GetAttr("strides", &stride_));
+ OP_REQUIRES(context, stride_.size() == 4,
+ errors::InvalidArgument("Sliding window stride field must "
+ "specify 4 dimensions"));
+ OP_REQUIRES_OK(context, context->GetAttr("padding", &padding_));
+ OP_REQUIRES(context, ksize_[0] == 1 && stride_[0] == 1,
+ errors::Unimplemented("Pooling is not yet supported on the "
+ "batch dimension."));
+ }
+
+ void Compute(OpKernelContext* context) override {
+ MklAvgPoolingOpContext mkl_context;
+ const Tensor& tensor_in = MklGetInput(context, 0);
+ GetMklShape(context, 0, &mkl_context.input_shape);
+ bool input_in_mkl_format = mkl_context.input_shape.IsMklTensor();
+
+ if (!input_in_mkl_format)
+ mkl_context.params.in_dim = tensor_in.dims();
+ else
+ mkl_context.params.in_dim = mkl_context.input_shape.GetDimension();
+
+ MklPoolParameters pool_params;
+ if (!input_in_mkl_format) {
+ pool_params.Init(context, ksize_, stride_, padding_, data_format_,
+ tensor_in.shape());
+ } else {
+ pool_params.Init(context, ksize_, stride_, padding_, data_format_,
+ &mkl_context.input_shape);
+ }
+
+ // Extract the parameters for the op from the pooling specs
+ ExtractMklOpParams(context, data_format_, pool_params, &mkl_context.params);
+
+ Tensor mkl_tmp_input_buf_tensor_;
+ mkl_context.MklCreateLayoutsAndPrimitives(context,
+ &mkl_tmp_input_buf_tensor_);
+
+ Tensor workspace_tensor;
+ void* workspace_buf;
+ AllocTmpBuffer(context, &workspace_tensor, mkl_context.lt_workspace,
+ &workspace_buf);
+
+ if (mkl_context.convert_input != nullptr) {
+ if (input_in_mkl_format == false) {
+ CHECK_EQ(
+ dnnConversionExecute_F32(
+ mkl_context.convert_input,
+ static_cast<void*>(const_cast<T*>(tensor_in.flat<T>().data())),
+ mkl_context.input_buf),
+ E_SUCCESS);
+ CHECK_EQ(dnnDelete_F32(mkl_context.convert_input), E_SUCCESS);
+ } else {
+ mkl_context.input_shape.GetConvertedFlatData(
+ mkl_context.lt_prim_input,
+ static_cast<void*>(const_cast<T*>(tensor_in.flat<T>().data())),
+ mkl_context.input_buf);
+ }
+ mkl_context.pooling_res[dnnResourceSrc] = mkl_context.input_buf;
+ } else {
+ mkl_context.pooling_res[dnnResourceSrc] =
+ static_cast<void*>(const_cast<T*>(tensor_in.flat<T>().data()));
+ }
+
+ // Declare output tensor and allocate memory
+ Tensor* output = nullptr;
+ TensorShape tensor_out_shape;
+ MklShape mkl_out_shape;
+ mkl_out_shape.SetMklTensor(true);
+ mkl_out_shape.SetMklLayout(mkl_context.prim_pooling_fwd, dnnResourceDst);
+ mkl_out_shape.SetTfLayout(mkl_context.params.in_dim,
+ mkl_context.params.out_sizes,
+ mkl_context.params.out_strides);
+ mkl_out_shape.SetTfDimOrder(mkl_context.params.in_dim, data_format_);
+
+ tensor_out_shape.AddDim(dnnLayoutGetMemorySize_F32(static_cast<dnnLayout_t>(
+ mkl_out_shape.GetMklLayout())) /
+ sizeof(T));
+
+ AllocateOutputSetMklshape(context, 0, &output, tensor_out_shape,
+ mkl_out_shape);
+ mkl_context.pooling_res[dnnResourceDst] =
+ static_cast<void*>(output->flat<T>().data());
+
+ mkl_context.pooling_res[dnnResourceWorkspace] = workspace_buf;
+
+ CHECK_EQ(
+ dnnExecute_F32(mkl_context.prim_pooling_fwd, mkl_context.pooling_res),
+ E_SUCCESS);
+
+ mkl_context.MklCleanup();
+ }
+
+ private:
+ typedef struct {
+ MklPoolingOpParams params;
+ MklShape input_shape;
+ dnnPrimitive_t prim_pooling_fwd, convert_input;
+ dnnLayout_t lt_user_input, lt_prim_input, lt_workspace;
+ void* input_buf;
+ void* pooling_res[dnnResourceNumber];
+
+ void MklCreateLayoutsAndPrimitives(OpKernelContext* context,
+ Tensor* mkl_tmp_input_buf_tensor) {
+ bool input_in_mkl_format = input_shape.IsMklTensor();
+
+ if (!input_in_mkl_format) {
+ CHECK_EQ(dnnLayoutCreate_F32(&lt_user_input, params.in_dim,
+ params.in_sizes, params.in_strides),
+ E_SUCCESS);
+ } else {
+ lt_user_input = (dnnLayout_t)input_shape.GetCurLayout();
+ }
+
+ dnnAlgorithm_t algorithm = dnnAlgorithmPoolingAvg;
+ dnnPrimitiveAttributes_t primAttr = nullptr;
+
+ // Create DNN primitives
+ CHECK_EQ(dnnPoolingCreateForward_F32(
+ &prim_pooling_fwd, primAttr, algorithm, lt_user_input,
+ params.kernel_size, params.kernel_stride, params.in_offset,
+ dnnBorderZerosAsymm),
+ E_SUCCESS);
+
+ CHECK_EQ(dnnLayoutCreateFromPrimitive_F32(
+ &lt_prim_input, prim_pooling_fwd, dnnResourceSrc),
+ E_SUCCESS);
+ if (!dnnLayoutCompare_F32(lt_user_input, lt_prim_input)) {
+ CHECK_EQ(dnnConversionCreate_F32(&convert_input, lt_user_input,
+ lt_prim_input),
+ E_SUCCESS);
+
+ AllocTmpBuffer(context, mkl_tmp_input_buf_tensor, lt_prim_input,
+ &input_buf);
+ }
+
+ CHECK_EQ(dnnLayoutCreateFromPrimitive_F32(&lt_workspace, prim_pooling_fwd,
+ dnnResourceWorkspace),
+ E_SUCCESS);
+ }
+
+ void MklCleanup() {
+ bool input_in_mkl_format = input_shape.IsMklTensor();
+ if (!input_in_mkl_format) {
+ CHECK_EQ(dnnLayoutDelete_F32(lt_user_input), E_SUCCESS);
+ }
+
+ CHECK_EQ(dnnDelete_F32(prim_pooling_fwd), E_SUCCESS);
+ CHECK_EQ(dnnLayoutDelete_F32(lt_prim_input), E_SUCCESS);
+ }
+ } MklAvgPoolingOpContext;
+
+ std::vector<int32> ksize_;
+ std::vector<int32> stride_;
+ Padding padding_;
+ TensorFormat data_format_;
+};
+
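The op above follows the standard pattern of the C-style MKL DNN API: create the user layout, derive the primitive's preferred layout, and insert a conversion only when the two differ. A condensed sketch of just that pattern, where dims, sizes, strides, prim, user_buf, and prim_buf stand in for values prepared elsewhere:

// Sketch: convert a user buffer to a primitive's preferred layout if needed.
dnnLayout_t lt_user, lt_prim;
CHECK_EQ(dnnLayoutCreate_F32(&lt_user, dims, sizes, strides), E_SUCCESS);
CHECK_EQ(dnnLayoutCreateFromPrimitive_F32(&lt_prim, prim, dnnResourceSrc),
         E_SUCCESS);
if (!dnnLayoutCompare_F32(lt_user, lt_prim)) {  // layouts differ: convert
  dnnPrimitive_t convert;
  CHECK_EQ(dnnConversionCreate_F32(&convert, lt_user, lt_prim), E_SUCCESS);
  CHECK_EQ(dnnConversionExecute_F32(convert, user_buf, prim_buf), E_SUCCESS);
  CHECK_EQ(dnnDelete_F32(convert), E_SUCCESS);
}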
+//-----------------------------------------------------------------------------
+
+template <class Device, class T>
+class MklAvgPoolingGradOp : public OpKernel {
+ public:
+ explicit MklAvgPoolingGradOp(OpKernelConstruction* context)
+ : OpKernel(context) {
+ string data_format;
+
+ OP_REQUIRES_OK(context, context->GetAttr("data_format", &data_format));
+ OP_REQUIRES(context, FormatFromString(data_format, &data_format_),
+ errors::InvalidArgument("Invalid data format"));
+ OP_REQUIRES_OK(context, context->GetAttr("ksize", &ksize_));
+ OP_REQUIRES(context, ksize_.size() == 4,
+ errors::InvalidArgument("Sliding window ksize field must "
+ "specify 4 dimensions"));
+ OP_REQUIRES_OK(context, context->GetAttr("strides", &stride_));
+ OP_REQUIRES(context, stride_.size() == 4,
+ errors::InvalidArgument("Sliding window strides field must "
+ "specify 4 dimensions"));
+ OP_REQUIRES_OK(context, context->GetAttr("padding", &padding_));
+ OP_REQUIRES(context, ksize_[0] == 1 && stride_[0] == 1,
+ errors::Unimplemented("Pooling is not yet supported on the "
+ "batch dimension."));
+ }
+
+ void Compute(OpKernelContext* context) override {
+ MklAvgPoolingGradOpContext mkl_context;
+ const Tensor& tensor_in_shape = MklGetInput(context, 0);
+ const Tensor& out_backprop = MklGetInput(context, 1);
+ GetMklShape(context, 1, &mkl_context.out_backprop_shape);
+ bool outbackprop_in_mkl_format =
+ mkl_context.out_backprop_shape.IsMklTensor();
+
+ TensorShape output_shape;
+ auto shape_vec = tensor_in_shape.vec<int32>();
+ for (int64 i = 0; i < tensor_in_shape.NumElements(); ++i) {
+ output_shape.AddDim(shape_vec(i));
+ }
+
+ MklPoolParameters pool_params;
+ pool_params.Init(context, ksize_, stride_, padding_, data_format_,
+ output_shape);
+
+ // Extract the parameters for the op from the pooling specs
+ ExtractMklOpParams(context, data_format_, pool_params, &mkl_context.params);
+
+ // Tensors needed to create temporary buffers
+ Tensor outbackprop_buf_tensor;
+ void* outbackprop_buf;
+ mkl_context.MklCreateLayoutsAndPrimitives(context);
+
+ // Check if outbackprop layout requires conversion.
+ if (!dnnLayoutCompare_F32(mkl_context.lt_user_outbackprop,
+ mkl_context.lt_prim_outbackprop)) {
+ CHECK_EQ(dnnConversionCreate_F32(&mkl_context.convert_outbackprop,
+ mkl_context.lt_user_outbackprop,
+ mkl_context.lt_prim_outbackprop),
+ E_SUCCESS);
+
+ AllocTmpBuffer(context, &outbackprop_buf_tensor,
+ mkl_context.lt_prim_outbackprop, &outbackprop_buf);
+
+ if (!outbackprop_in_mkl_format) {
+ CHECK_EQ(dnnConversionExecute_F32(mkl_context.convert_outbackprop,
+ static_cast<void*>(const_cast<T*>(
+ out_backprop.flat<T>().data())),
+ outbackprop_buf),
+ E_SUCCESS);
+ CHECK_EQ(dnnDelete_F32(mkl_context.convert_outbackprop), E_SUCCESS);
+ } else {
+ mkl_context.out_backprop_shape.GetConvertedFlatData(
+ mkl_context.lt_prim_outbackprop,
+ static_cast<void*>(const_cast<T*>(out_backprop.flat<T>().data())),
+ outbackprop_buf);
+ }
+ mkl_context.pooling_res[dnnResourceDiffDst] = outbackprop_buf;
+ } else {
+ mkl_context.pooling_res[dnnResourceDiffDst] =
+ static_cast<void*>(const_cast<T*>(out_backprop.flat<T>().data()));
+ }
+
+ // Handle workspace requirements.
+ Tensor workspace_buf_tensor;
+ void* workspace_buf;
+ AllocTmpBuffer(context, &workspace_buf_tensor, mkl_context.lt_workspace,
+ &workspace_buf);
+ mkl_context.pooling_res[dnnResourceWorkspace] = workspace_buf;
+
+ // Handle MKL output tensor setup.
+ Tensor* output = nullptr;
+ TensorShape tensor_out_shape;
+ MklShape mkl_out_shape;
+ mkl_out_shape.SetMklTensor(true);
+ mkl_out_shape.SetMklLayout(mkl_context.prim_pooling_bwd,
+ dnnResourceDiffSrc);
+ mkl_out_shape.SetTfLayout(mkl_context.params.in_dim,
+ mkl_context.params.in_sizes,
+ mkl_context.params.in_strides);
+ mkl_out_shape.SetTfDimOrder(mkl_context.params.in_dim, data_format_);
+
+ tensor_out_shape.AddDim(dnnLayoutGetMemorySize_F32(static_cast<dnnLayout_t>(
+ mkl_out_shape.GetMklLayout())) /
+ sizeof(T));
+
+ AllocateOutputSetMklshape(context, 0, &output, tensor_out_shape,
+ mkl_out_shape);
+
+ // Set output tensor.
+ mkl_context.pooling_res[dnnResourceDiffSrc] =
+ static_cast<void*>(output->flat<T>().data());
+
+ // Execute primitive.
+ CHECK_EQ(
+ dnnExecute_F32(mkl_context.prim_pooling_bwd, mkl_context.pooling_res),
+ E_SUCCESS);
+
+ mkl_context.MklCleanup();
+ }
+
+ private:
+ typedef struct {
+ MklPoolingOpParams params;
+ MklShape out_backprop_shape;
+ dnnPrimitive_t prim_pooling_bwd, convert_outbackprop;
+ void* pooling_res[dnnResourceNumber];
+ dnnLayout_t lt_user_input, lt_user_outbackprop, lt_prim_outbackprop,
+ lt_workspace;
+
+ void MklCreateLayoutsAndPrimitives(OpKernelContext* context) {
+ const Tensor& tensor_in_shape = MklGetInput(context, 0);
+ const Tensor& out_backprop = MklGetInput(context, 1);
+ bool outbackprop_in_mkl_format = out_backprop_shape.IsMklTensor();
+
+ if (!outbackprop_in_mkl_format) {
+ // For avgpooling, tensor_in_shape should have 1 dimension and 4
+ // elements.
+ OP_REQUIRES(
+ context,
+ tensor_in_shape.dims() == 1 && tensor_in_shape.NumElements() == 4,
+ errors::InvalidArgument("original input shape must be "
+ "1-dimensional and 4 elements"));
+
+ // For avgpooling, out_backprop should have 4 dimensions.
+ OP_REQUIRES(context, out_backprop.dims() == 4,
+ errors::InvalidArgument("out_backprop must be "
+ "4-dimensional"));
+ } else {
+ // Input in MKL format.
+ OP_REQUIRES(
+ context, out_backprop.dims() == 2,
+ errors::InvalidArgument("out_backprop in MKL format must be "
+ "2-dimensional"));
+
+ // For avgpooling, out_backprop should have 4 dimensions.
+ OP_REQUIRES(context, out_backprop_shape.GetDimension() == 4,
+ errors::InvalidArgument("out_backprop must be "
+ "4-dimensional"));
+ }
+
+ // TODO(inteltf): Get outbackprop layout.
+ // Do we need to create layout in every invocation?
+ if (!outbackprop_in_mkl_format) {
+ CHECK_EQ(dnnLayoutCreate_F32(&lt_user_outbackprop, params.in_dim,
+ params.out_sizes, params.out_strides),
+ E_SUCCESS);
+ } else {
+ lt_user_outbackprop = (dnnLayout_t)out_backprop_shape.GetCurLayout();
+ }
+
+ // Create the backward primitive
+ // Create DNN user layout
+ CHECK_EQ(dnnLayoutCreate_F32(&lt_user_input, params.in_dim,
+ params.in_sizes, params.in_strides),
+ E_SUCCESS);
+
+ // Create PoolingBackward primitive
+ dnnAlgorithm_t algorithm = dnnAlgorithmPoolingAvg;
+ dnnPrimitiveAttributes_t primAttr = nullptr;
+ CHECK_EQ(dnnPoolingCreateBackward_F32(
+ &prim_pooling_bwd, primAttr, algorithm, lt_user_input,
+ params.kernel_size, params.kernel_stride, params.in_offset,
+ dnnBorderZerosAsymm),
+ E_SUCCESS);
+
+ // Create expected outbackprop layout from the primitive.
+ CHECK_EQ(dnnLayoutCreateFromPrimitive_F32(
+ &lt_prim_outbackprop, prim_pooling_bwd, dnnResourceDiffDst),
+ E_SUCCESS);
+
+ CHECK_EQ(dnnLayoutCreateFromPrimitive_F32(&lt_workspace, prim_pooling_bwd,
+ dnnResourceWorkspace),
+ E_SUCCESS);
+ }
+
+ void MklCleanup() {
+ bool outbackprop_in_mkl_format = out_backprop_shape.IsMklTensor();
+ CHECK_EQ(dnnDelete_F32(prim_pooling_bwd), E_SUCCESS);
+ CHECK_EQ(dnnLayoutDelete_F32(lt_user_input), E_SUCCESS);
+ if (!outbackprop_in_mkl_format) {
+ CHECK_EQ(dnnLayoutDelete_F32(lt_user_outbackprop), E_SUCCESS);
+ }
+ CHECK_EQ(dnnLayoutDelete_F32(lt_prim_outbackprop), E_SUCCESS);
+ CHECK_EQ(dnnLayoutDelete_F32(lt_workspace), E_SUCCESS);
+ }
+ } MklAvgPoolingGradOpContext;
+
+ std::vector<int32> ksize_;
+ std::vector<int32> stride_;
+ Padding padding_;
+ TensorFormat data_format_;
+};
+
+REGISTER_KERNEL_BUILDER(Name("MklAvgPool")
+ .Device(DEVICE_CPU)
+ .TypeConstraint<float>("T")
+ .Label(mkl_layer_registry::kMklLayerLabel),
+ MklAvgPoolingOp<CPUDevice, float>);
+
+REGISTER_KERNEL_BUILDER(Name("MklAvgPoolGrad")
+ .Device(DEVICE_CPU)
+ .TypeConstraint<float>("T")
+ .Label(mkl_layer_registry::kMklLayerLabel),
+ MklAvgPoolingGradOp<CPUDevice, float>);
+
+} // namespace tensorflow
+#endif // INTEL_MKL
diff --git a/tensorflow/core/kernels/mkl_conv_grad_bias_ops.cc b/tensorflow/core/kernels/mkl_conv_grad_bias_ops.cc
new file mode 100644
index 0000000000..627fd83b0d
--- /dev/null
+++ b/tensorflow/core/kernels/mkl_conv_grad_bias_ops.cc
@@ -0,0 +1,264 @@
+/* Copyright 2015 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+// See docs in ../ops/nn_ops.cc. This OpKernel uses the MKL library to create
+// MKL layouts and primitives and uses MKL DNN primitives to compute the
+// convolution backward bias.
+
+#ifdef INTEL_MKL
+
+#define USE_EIGEN_TENSOR
+#define EIGEN_USE_THREADS
+
+#include "tensorflow/core/framework/numeric_op.h"
+#include "tensorflow/core/framework/op_kernel.h"
+#include "tensorflow/core/framework/register_types.h"
+#include "tensorflow/core/framework/tensor.h"
+#include "tensorflow/core/framework/tensor_shape.h"
+#include "tensorflow/core/framework/tensor_slice.h"
+#include "tensorflow/core/kernels/ops_util.h"
+#include "tensorflow/core/lib/core/errors.h"
+#include "tensorflow/core/lib/gtl/array_slice.h"
+#include "tensorflow/core/platform/logging.h"
+#include "tensorflow/core/platform/macros.h"
+#include "tensorflow/core/util/padding.h"
+#include "tensorflow/core/util/tensor_format.h"
+#include "tensorflow/core/util/use_cudnn.h"
+#include "tensorflow/core/util/work_sharder.h"
+
+#include "third_party/mkl/include/mkl_dnn.h"
+#include "third_party/mkl/include/mkl_dnn_types.h"
+#include "tensorflow/core/util/mkl_util.h"
+
+namespace tensorflow {
+
+typedef Eigen::ThreadPoolDevice CPUDevice;
+
+template <typename Device, class T>
+class MklConv2DCustomBackpropBiasOp : public OpKernel {
+ public:
+ explicit MklConv2DCustomBackpropBiasOp(OpKernelConstruction* context)
+ : OpKernel(context) {
+ string data_format;
+ OP_REQUIRES_OK(context, context->GetAttr("data_format", &data_format));
+ OP_REQUIRES(context, FormatFromString(data_format, &data_format_),
+ errors::InvalidArgument("Invalid data format"));
+ }
+ ~MklConv2DCustomBackpropBiasOp() {}
+
+ void Compute(OpKernelContext* context) override {
+ MklConvBackBiasOpContext mkl_context;
+ const Tensor& input = MklGetInput(context, 0);
+ GetMklShape(context, 0, &mkl_context.input_shape);
+ bool input_is_mkl = mkl_context.input_shape.IsMklTensor();
+
+ if (input_is_mkl) {
+ OP_REQUIRES(
+ context, mkl_context.input_shape.GetDimension() == 4,
+ errors::InvalidArgument("Input tensor must be 4-dimensional"));
+ } else {
+ OP_REQUIRES(context, input.dims() == 4,
+ errors::InvalidArgument("input must be 4-dimensional",
+ input.shape().DebugString()));
+ }
+
+ if (input_is_mkl) {
+ mkl_context.c_size = mkl_context.input_shape.GetSizes()[MklDims::C];
+ } else if (data_format_ == FORMAT_NHWC || data_format_ == FORMAT_NCHW) {
+ mkl_context.c_size = GetTensorDim(input, data_format_, 'C');
+ } else {
+ errors::InvalidArgument("Unknown format ",
+ " Format must be either NCHW or NHWC. ");
+ }
+ TensorShape output_shape{mkl_context.c_size};
+
+ Tensor* bias_backprop = nullptr;
+ MklShape output_mkl_shape;
+ output_mkl_shape.SetMklTensor(false);
+ AllocateOutputSetMklshape(context, 0, &bias_backprop, output_shape,
+ output_mkl_shape);
+
+ mkl_context.in_dims = 4;
+
+ if (input_is_mkl) { // get the shape from the mkl shape
+ mkl_context.in_sizes[MklDims::W] =
+ mkl_context.input_shape.GetSizes()[MklDims::W];
+ mkl_context.in_sizes[MklDims::H] =
+ mkl_context.input_shape.GetSizes()[MklDims::H];
+ mkl_context.in_sizes[MklDims::C] =
+ mkl_context.input_shape.GetSizes()[MklDims::C];
+ mkl_context.in_sizes[MklDims::N] =
+ mkl_context.input_shape.GetSizes()[MklDims::N];
+ } else {
+ mkl_context.in_sizes[MklDims::W] = GetTensorDim(input, data_format_, 'W');
+ mkl_context.in_sizes[MklDims::H] = GetTensorDim(input, data_format_, 'H');
+ mkl_context.in_sizes[MklDims::C] = GetTensorDim(input, data_format_, 'C');
+ mkl_context.in_sizes[MklDims::N] = GetTensorDim(input, data_format_, 'N');
+ GetStridesFromSizes(data_format_, mkl_context.in_strides,
+ mkl_context.in_sizes);
+ }
+
+ mkl_context.out_sizes[0] = mkl_context.c_size;
+ mkl_context.out_strides[0] = 1;
+
+ CHECK_EQ(
+ dnnConvolutionCreateBackwardBias_F32(
+ &mkl_context.prim_conv_bwdbias, nullptr, dnnAlgorithmConvolutionDirect,
+ mkl_context.in_dims, mkl_context.in_sizes),
+ E_SUCCESS);
+
+ mkl_context.MklCreateInputLayouts(context);
+
+ Tensor mkl_tmp_input_buf, mkl_tmp_outbackprop_buf;
+ mkl_context.MklPrepareConvolutionInputs(context, &mkl_tmp_input_buf);
+ mkl_context.MklPrepareConvolutionOutputs(context, &mkl_tmp_outbackprop_buf,
+ bias_backprop);
+
+ CHECK_EQ(
+ dnnExecute_F32(mkl_context.prim_conv_bwdbias, mkl_context.conv_res),
+ E_SUCCESS);
+ if (mkl_context.should_convert_output) {
+ CHECK_EQ(dnnConversionExecute_F32(
+ mkl_context.convert_outbackprop, mkl_context.outbackprop_buf,
+ static_cast<void*>(bias_backprop->flat<T>().data())),
+ E_SUCCESS);
+ }
+ // deletes layouts
+ mkl_context.MklCleanup();
+ }
+
+ private:
+ typedef struct {
+ int in_dims;
+ int c_size;
+ size_t in_sizes[4];
+ size_t in_strides[4];
+ size_t out_sizes[1];
+ size_t out_strides[1];
+ size_t filter_sizes[4];
+ size_t filter_strides[4];
+ int input_offset[2];
+ size_t conv_stride[2];
+ MklShape input_shape;
+ dnnPrimitive_t prim_conv_bwdbias;
+ void* conv_res[dnnResourceNumber];
+ dnnLayout_t lt_input, lt_outbackprop;
+ bool should_convert_output;
+ dnnPrimitive_t convert_outbackprop;
+ void* outbackprop_buf;
+
+ // Create MKL dnnLayout_t objects for tensors coming into the layer
+ void MklCreateInputLayouts(OpKernelContext* context) {
+ bool input_is_mkl = input_shape.IsMklTensor();
+
+ CHECK_EQ(dnnLayoutCreate_F32(&lt_outbackprop, 1, out_sizes, out_strides),
+ E_SUCCESS);
+ if (input_is_mkl) {
+ lt_input = static_cast<dnnLayout_t>(input_shape.GetCurLayout());
+ } else {
+ CHECK_EQ(dnnLayoutCreate_F32(&lt_input, in_dims, in_sizes, in_strides),
+ E_SUCCESS);
+ }
+ }
+
+ // Compare incoming output tensor layouts with MKL preferred layouts and
+ // convert data to the preferred layout if necessary
+ void MklPrepareConvolutionOutputs(OpKernelContext* context,
+ Tensor* mkl_tmp_outbackprop_buf,
+ Tensor* bias_backprop) {
+ dnnLayout_t mkl_prim_internal_outbackprop = nullptr;
+ CHECK_EQ(dnnLayoutCreateFromPrimitive_F32(&mkl_prim_internal_outbackprop,
+ prim_conv_bwdbias,
+ dnnResourceDiffBias),
+ E_SUCCESS);
+ should_convert_output =
+ !dnnLayoutCompare_F32(lt_outbackprop, mkl_prim_internal_outbackprop);
+ if (should_convert_output) {
+ CHECK_EQ(dnnConversionCreate_F32(&convert_outbackprop,
+ mkl_prim_internal_outbackprop,
+ lt_outbackprop),
+ E_SUCCESS);
+ AllocTmpBuffer(context, mkl_tmp_outbackprop_buf,
+ mkl_prim_internal_outbackprop, &outbackprop_buf);
+ conv_res[dnnResourceDiffBias] = outbackprop_buf;
+ } else {
+ conv_res[dnnResourceDiffBias] =
+ static_cast<void*>(const_cast<T*>(bias_backprop->flat<T>().data()));
+ }
+
+ dnnLayoutDelete_F32(mkl_prim_internal_outbackprop);
+ }
+
+ // Compare incoming input tensor layouts with MKL preferred layouts and
+ // convert data to the preferred layout if necessary
+ void MklPrepareConvolutionInputs(OpKernelContext* context,
+ Tensor* mkl_tmp_input_buf) {
+ dnnLayout_t mkl_prim_internal_input = nullptr;
+ dnnPrimitive_t mkl_convert_input = nullptr;
+ void* input_buf = nullptr;
+ const Tensor& input = MklGetInput(context, 0);
+
+ CHECK_EQ(
+ dnnLayoutCreateFromPrimitive_F32(
+ &mkl_prim_internal_input, prim_conv_bwdbias, dnnResourceDiffDst),
+ E_SUCCESS);
+
+ if (!dnnLayoutCompare_F32(lt_input, mkl_prim_internal_input)) {
+ CHECK_EQ(dnnConversionCreate_F32(&mkl_convert_input, lt_input,
+ mkl_prim_internal_input),
+ E_SUCCESS);
+ AllocTmpBuffer(context, mkl_tmp_input_buf, mkl_prim_internal_input,
+ &input_buf);
+ CHECK_EQ(dnnConversionExecute_F32(
+ mkl_convert_input,
+ static_cast<void*>(const_cast<T*>(input.flat<T>().data())),
+ input_buf),
+ E_SUCCESS);
+ conv_res[dnnResourceDiffDst] = input_buf;
+ dnnDelete_F32(mkl_convert_input);
+ } else {
+ conv_res[dnnResourceDiffDst] =
+ static_cast<void*>(const_cast<T*>(input.flat<T>().data()));
+ }
+
+ dnnLayoutDelete_F32(mkl_prim_internal_input);
+ }
+
+ // Cleanup member layouts and primitives
+ void MklCleanup() {
+ bool input_is_mkl = input_shape.IsMklTensor();
+ if (!input_is_mkl) dnnLayoutDelete_F32(lt_input);
+ dnnLayoutDelete_F32(lt_outbackprop);
+
+ if (should_convert_output) dnnDelete_F32(convert_outbackprop);
+ dnnDelete_F32(prim_conv_bwdbias);
+ }
+ } MklConvBackBiasOpContext;
+
+ TensorFormat data_format_;
+ TF_DISALLOW_COPY_AND_ASSIGN(MklConv2DCustomBackpropBiasOp);
+};
+
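Mathematically, the backward-bias primitive reduces out_backprop over the batch and spatial dimensions, leaving one value per output channel. A minimal host-side reference of that computation for NHWC (a hypothetical helper, not part of the change):

#include <vector>

// grad_bias[c] = sum over n,h,w of out_backprop[n,h,w,c] (NHWC layout).
std::vector<float> BiasGradRef(const float* out_backprop, int batch,
                               int height, int width, int channels) {
  std::vector<float> grad(channels, 0.0f);
  for (int i = 0; i < batch * height * width; ++i)
    for (int c = 0; c < channels; ++c)
      grad[c] += out_backprop[i * channels + c];
  return grad;
}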
+#define REGISTER_CPU_KERNELS(T) \
+ REGISTER_KERNEL_BUILDER(Name("MklConv2DWithBiasBackpropBias") \
+ .Device(DEVICE_CPU) \
+ .TypeConstraint<T>("T") \
+ .Label(mkl_layer_registry::kMklLayerLabel), \
+ MklConv2DCustomBackpropBiasOp<CPUDevice, T>);
+
+TF_CALL_float(REGISTER_CPU_KERNELS);
+#undef REGISTER_CPU_KERNELS
+} /* namespace tensorflow */
+#endif /* INTEL_MKL */
diff --git a/tensorflow/core/kernels/mkl_conv_grad_filter_ops.cc b/tensorflow/core/kernels/mkl_conv_grad_filter_ops.cc
new file mode 100644
index 0000000000..85198d89f5
--- /dev/null
+++ b/tensorflow/core/kernels/mkl_conv_grad_filter_ops.cc
@@ -0,0 +1,422 @@
+/* Copyright 2015 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+// See docs in ../ops/nn_ops.cc.
+
+#ifdef INTEL_MKL
+
+#include <algorithm>
+#include <vector>
+
+#include "tensorflow/core/framework/numeric_op.h"
+#include "tensorflow/core/framework/op_kernel.h"
+#include "tensorflow/core/framework/register_types.h"
+#include "tensorflow/core/framework/tensor.h"
+#include "tensorflow/core/framework/tensor_shape.h"
+#include "tensorflow/core/framework/tensor_slice.h"
+#include "tensorflow/core/kernels/conv_grad_ops.h"
+#include "tensorflow/core/kernels/ops_util.h"
+#include "tensorflow/core/lib/core/errors.h"
+#include "tensorflow/core/lib/gtl/array_slice.h"
+#include "tensorflow/core/platform/logging.h"
+#include "tensorflow/core/platform/macros.h"
+#include "tensorflow/core/util/padding.h"
+#include "tensorflow/core/util/tensor_format.h"
+#include "tensorflow/core/util/use_cudnn.h"
+#include "tensorflow/core/util/work_sharder.h"
+
+#include "third_party/mkl/include/mkl_dnn.h"
+#include "third_party/mkl/include/mkl_dnn_types.h"
+#include "tensorflow/core/util/mkl_util.h"
+
+namespace tensorflow {
+
+typedef Eigen::ThreadPoolDevice CPUDevice;
+
+template <typename Device, class T>
+class MklConv2DCustomBackpropFilterOp : public OpKernel {
+ public:
+ explicit MklConv2DCustomBackpropFilterOp(OpKernelConstruction* context)
+ : OpKernel(context) {
+ string data_format;
+ OP_REQUIRES_OK(context, context->GetAttr("data_format", &data_format));
+ OP_REQUIRES(context, FormatFromString(data_format, &data_format_),
+ errors::InvalidArgument("Invalid data format"));
+
+ OP_REQUIRES_OK(context, context->GetAttr("strides", &strides_));
+ int stride_n = GetTensorDim(strides_, data_format_, 'N');
+ int stride_c = GetTensorDim(strides_, data_format_, 'C');
+ OP_REQUIRES(
+ context, (stride_n == 1 && stride_c == 1),
+ errors::InvalidArgument("Current implementation does not yet support "
+ "strides in the batch and depth dimensions."));
+ OP_REQUIRES_OK(context, context->GetAttr("padding", &padding_));
+ }
+
+ void Compute(OpKernelContext* context) override {
+ MklConv2DGradFilterOpContext mkl_context;
+ const Tensor& input = MklGetInput(context, 0);
+ GetMklShape(context, 0, &(mkl_context.input_shape));
+ bool input_in_mkl_format = mkl_context.input_shape.IsMklTensor();
+
+ const Tensor& filter_sizes = MklGetInput(context, 1);
+
+ const Tensor& out_backprop = MklGetInput(context, 2);
+ GetMklShape(context, 2, &(mkl_context.out_backprop_shape));
+ bool out_backprop_in_mkl_format =
+ mkl_context.out_backprop_shape.IsMklTensor();
+
+ TensorShape input_shape, filter_shape, out_backprop_shape;
+
+ OP_REQUIRES(
+ context, TensorShapeUtils::IsVector(filter_sizes.shape()),
+ errors::InvalidArgument(
+ "Conv2DCustomBackpropFilter: filter_sizes input must be 1-dim, "
+ "not ",
+ filter_sizes.dims()));
+ OP_REQUIRES_OK(context, TensorShapeUtils::MakeShape(
+ filter_sizes.vec<int32>(), &filter_shape));
+
+ ConvBackpropDimensions backprop_dims;
+
+ // Generate shape for input if input is in MKL format.
+ if (input_in_mkl_format) {
+ OP_REQUIRES(context, mkl_context.input_shape.GetDimension() == 4,
+ errors::InvalidArgument(
+ "Conv2DCustomBackpropFilter: input size must be 4-dim"));
+
+ MklSizesToTFSizes(context, data_format_, mkl_context.input_shape,
+ &input_shape);
+ } else {
+ input_shape = input.shape();
+ }
+
+ // Generate shape for out_backprop if it is in MKL format.
+ if (out_backprop_in_mkl_format) {
+ OP_REQUIRES(
+ context, mkl_context.out_backprop_shape.GetDimension() == 4,
+ errors::InvalidArgument(
+ "Conv2DCustomBackpropFilter: outbackprop size must be 4-dim"));
+
+ MklSizesToTFSizes(context, data_format_, mkl_context.out_backprop_shape,
+ &out_backprop_shape);
+ } else {
+ out_backprop_shape = out_backprop.shape();
+ }
+
+ OP_REQUIRES_OK(context,
+ ConvBackpropComputeDimensions(
+ "Conv2DCustomBackpropFilter", /*num_spatial_dims=*/2,
+ input_shape, filter_shape, out_backprop_shape, strides_,
+ padding_, data_format_, &backprop_dims));
+
+ int64 pad_top, pad_bottom;
+ int64 pad_left, pad_right;
+ OP_REQUIRES_OK(context, GetWindowedOutputSizeVerbose(
+ backprop_dims.spatial_dims[0].input_size,
+ backprop_dims.spatial_dims[0].filter_size,
+ backprop_dims.spatial_dims[0].stride, padding_,
+ &backprop_dims.spatial_dims[0].output_size,
+ &pad_top, &pad_bottom));
+ OP_REQUIRES_OK(context, GetWindowedOutputSizeVerbose(
+ backprop_dims.spatial_dims[1].input_size,
+ backprop_dims.spatial_dims[1].filter_size,
+ backprop_dims.spatial_dims[1].stride, padding_,
+ &backprop_dims.spatial_dims[1].output_size,
+ &pad_left, &pad_right));
+
+ // Create MKL primitives for convolution filter grad
+ mkl_context.in_dims = input_in_mkl_format
+ ? mkl_context.input_shape.GetDimension()
+ : input.dims();
+ mkl_context.out_dims = out_backprop_in_mkl_format
+ ? mkl_context.out_backprop_shape.GetDimension()
+ : out_backprop.dims();
+ mkl_context.in_sizes[0] =
+ static_cast<size_t>(backprop_dims.spatial_dims[1].input_size);
+ mkl_context.in_sizes[1] =
+ static_cast<size_t>(backprop_dims.spatial_dims[0].input_size);
+ mkl_context.in_sizes[2] = static_cast<size_t>(backprop_dims.in_depth);
+ mkl_context.in_sizes[3] = static_cast<size_t>(backprop_dims.batch_size);
+ mkl_context.out_sizes[0] =
+ static_cast<size_t>(backprop_dims.spatial_dims[1].output_size);
+ mkl_context.out_sizes[1] =
+ static_cast<size_t>(backprop_dims.spatial_dims[0].output_size);
+ mkl_context.out_sizes[2] = static_cast<size_t>(backprop_dims.out_depth);
+ mkl_context.out_sizes[3] = static_cast<size_t>(backprop_dims.batch_size);
+ mkl_context.input_offsets[0] = static_cast<int>(-pad_left);
+ mkl_context.input_offsets[1] = static_cast<int>(-pad_top);
+ mkl_context.conv_strides[0] =
+ static_cast<size_t>(backprop_dims.spatial_dims[1].stride);
+ mkl_context.conv_strides[1] =
+ static_cast<size_t>(backprop_dims.spatial_dims[0].stride);
+
+ GetStridesFromSizes(data_format_, mkl_context.in_strides,
+ mkl_context.in_sizes);
+ GetStridesFromSizes(data_format_, mkl_context.out_strides,
+ mkl_context.out_sizes);
+
+ // MKL interprets dimension indices 0, 1, 2, and 3 as filter cols, rows,
+ // input channels, and output depth/channels, respectively.
+ mkl_context.filter_dims = 4;
+ mkl_context.filter_sizes[0] = backprop_dims.spatial_dims[1].filter_size;
+ mkl_context.filter_sizes[1] = backprop_dims.spatial_dims[0].filter_size;
+ mkl_context.filter_sizes[2] = backprop_dims.in_depth;
+ mkl_context.filter_sizes[3] = backprop_dims.out_depth;
+
+ // We want the filter grad to be in TF format, so set the strides
+ // accordingly. Note the TF filter layout is (rows, cols, in_depth,
+ // out_depth), with rows outermost and out_depth as the innermost
+ // (stride-1) dimension.
+ mkl_context.filter_strides[0] =
+ backprop_dims.out_depth * backprop_dims.in_depth;
+ mkl_context.filter_strides[1] = backprop_dims.out_depth *
+ backprop_dims.in_depth *
+ backprop_dims.spatial_dims[1].filter_size;
+ mkl_context.filter_strides[2] = backprop_dims.out_depth;
+ mkl_context.filter_strides[3] = 1;
+
+ mkl_context.conv_strides[0] = backprop_dims.spatial_dims[1].stride;
+ mkl_context.conv_strides[1] = backprop_dims.spatial_dims[0].stride;
+
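Because TF's HWIO filter layout keeps out_depth innermost, the strides computed above make out_depth the stride-1 dimension. A worked example for a hypothetical 3x3 filter with in_depth=64 and out_depth=128:

// Strides (in elements) for a 3x3x64x128 HWIO filter, per the code above:
// filter_strides[0] (cols)      = 128 * 64     = 8192
// filter_strides[1] (rows)      = 128 * 64 * 3 = 24576
// filter_strides[2] (in_depth)  = 128
// filter_strides[3] (out_depth) = 1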
+ // Create convolution-grad-filter primitive
+ CHECK_EQ(dnnConvolutionCreateBackwardFilter_F32(
+ &mkl_context.prim_conv_bwdfilter, nullptr,
+ dnnAlgorithmConvolutionDirect, mkl_context.in_dims,
+ mkl_context.in_sizes, mkl_context.out_sizes,
+ mkl_context.filter_sizes, mkl_context.conv_strides,
+ mkl_context.input_offsets, dnnBorderZeros),
+ E_SUCCESS);
+
+ // Create the layouts for entities in received context.
+ mkl_context.MklCreateInputLayouts(context);
+
+ // Mkl needs the entities in its native format.
+ // So create temporary tensors along with buffers to
+ // convert the received entities.
+ Tensor mkl_tmp_input_buf_tensor, mkl_tmp_out_backprop_buf_tensor;
+ // This preparation sets (1) dnnResourceSrc (2) dnnResourceDiffDst
+ mkl_context.MklPrepareInputs(context, &mkl_tmp_input_buf_tensor,
+ &mkl_tmp_out_backprop_buf_tensor);
+
+ // Final conv-grad-filter should be in TF layout.
+ Tensor* grad_filter;
+ mkl_context.grad_filter_shape.SetMklTensor(false);
+ mkl_context.grad_filter_shape.SetTfLayout(mkl_context.filter_dims,
+ mkl_context.filter_sizes,
+ mkl_context.filter_strides);
+ AllocateOutputSetMklshape(context, 0, &grad_filter, filter_shape,
+ mkl_context.grad_filter_shape);
+
+ // Need to set member variable for TF layout
+ mkl_context.lt_grad_filter = mkl_context.grad_filter_shape.GetTfLayout();
+
+ // MKL conv-grad-filter might produce grad in its internal layout
+ Tensor mkl_tmp_grad_filter_buf_tensor;
+ // This preparation sets conversion primitive if required
+ // and allocates temporary tensor and its buffer without doing conversions.
+ // Also sets (3) dnnResourceDiffFilter accordingly
+ mkl_context.MklPrepareGradFilter(context, grad_filter,
+ &mkl_tmp_grad_filter_buf_tensor);
+
+ // After setting all the required dnnResources, ready for execution!
+ CHECK_EQ(
+ dnnExecute_F32(mkl_context.prim_conv_bwdfilter, mkl_context.conv_res),
+ E_SUCCESS);
+
+ // Convert grad-filter to TF layout
+ if (mkl_context.convert_bwdfilter != nullptr) {
+ void* mkl_buf_convert_grad_filter =
+ const_cast<void*>(static_cast<const void*>(
+ mkl_tmp_grad_filter_buf_tensor.flat<T>().data()));
+ void* mkl_buf_grad_filter = const_cast<void*>(
+ static_cast<const void*>(grad_filter->flat<T>().data()));
+ CHECK_EQ(dnnConversionExecute_F32(mkl_context.convert_bwdfilter,
+ mkl_buf_convert_grad_filter,
+ mkl_buf_grad_filter),
+ E_SUCCESS);
+ }
+
+ mkl_context.MklCleanup();
+ }
+
+ private:
+ typedef struct {
+ int in_dims;
+ size_t in_sizes[4];
+ size_t in_strides[4];
+ int out_dims;
+ size_t out_sizes[4];
+ size_t out_strides[4];
+ int filter_dims;
+ size_t filter_sizes[4];
+ size_t filter_strides[4];
+ int input_offsets[2];
+ size_t conv_strides[2];
+ MklShape input_shape, grad_filter_shape, out_backprop_shape;
+    dnnPrimitive_t prim_conv_bwdfilter = nullptr, convert_bwdfilter = nullptr;
+ dnnLayout_t lt_input, lt_grad_filter, lt_out_backprop;
+ void* conv_res[dnnResourceNumber];
+
+ void MklCleanup() {
+      // Clean up member layouts and primitives, except "lt_grad_filter",
+      // which points to the MklShape's TF layout.
+ bool input_in_mkl_format = input_shape.IsMklTensor();
+ bool out_backprop_in_mkl_format = out_backprop_shape.IsMklTensor();
+ if (!input_in_mkl_format) dnnLayoutDelete_F32(lt_input);
+ if (!out_backprop_in_mkl_format) dnnLayoutDelete_F32(lt_out_backprop);
+ if (convert_bwdfilter != nullptr) dnnDelete_F32(convert_bwdfilter);
+ dnnDelete_F32(prim_conv_bwdfilter);
+ }
+
+ // Create MKL dnnLayout_t objects for tensors coming into the layer
+ void MklCreateInputLayouts(OpKernelContext* context) {
+ bool input_in_mkl_format = input_shape.IsMklTensor();
+ if (input_in_mkl_format) {
+ lt_input = static_cast<dnnLayout_t>(input_shape.GetCurLayout());
+ } else {
+ CHECK_EQ(dnnLayoutCreate_F32(&lt_input, in_dims, in_sizes, in_strides),
+ E_SUCCESS);
+ }
+
+ bool out_backprop_in_mkl_format = out_backprop_shape.IsMklTensor();
+ if (out_backprop_in_mkl_format) {
+ lt_out_backprop =
+ static_cast<dnnLayout_t>(out_backprop_shape.GetCurLayout());
+ } else {
+ CHECK_EQ(dnnLayoutCreate_F32(&lt_out_backprop, out_dims, out_sizes,
+ out_strides),
+ E_SUCCESS);
+ }
+ }
+
+ // Compare incoming tensor layouts with MKL preferred layouts and convert
+ // data to the preferred layout if necessary
+ void MklPrepareInputs(OpKernelContext* context,
+ Tensor* mkl_tmp_input_buf_tensor,
+ Tensor* mkl_tmp_out_backprop_buf_tensor) {
+ bool mkl_convert_input, mkl_convert_out_backprop;
+ dnnPrimitive_t mkl_prim_convert_input, mkl_prim_convert_out_backprop;
+ dnnLayout_t mkl_lt_internal_input, mkl_lt_internal_out_backprop;
+ void *mkl_buf_convert_input, *mkl_buf_convert_out_backprop;
+
+ mkl_prim_convert_input = nullptr;
+ mkl_prim_convert_out_backprop = nullptr;
+ mkl_lt_internal_input = nullptr;
+ mkl_lt_internal_out_backprop = nullptr;
+ mkl_buf_convert_input = nullptr;
+ mkl_buf_convert_out_backprop = nullptr;
+
+ // Compare with internal layouts and convert if needed
+ const Tensor& input = MklGetInput(context, 0);
+ void* mkl_buf_input =
+ const_cast<void*>(static_cast<const void*>(input.flat<T>().data()));
+ CHECK_EQ(dnnLayoutCreateFromPrimitive_F32(
+ &mkl_lt_internal_input, prim_conv_bwdfilter, dnnResourceSrc),
+ E_SUCCESS);
+ mkl_convert_input =
+ !dnnLayoutCompare_F32(mkl_lt_internal_input, lt_input);
+ if (mkl_convert_input) {
+ CHECK_EQ(dnnConversionCreate_F32(&mkl_prim_convert_input, lt_input,
+ mkl_lt_internal_input),
+ E_SUCCESS);
+ AllocTmpBuffer(context, mkl_tmp_input_buf_tensor, mkl_lt_internal_input,
+ &mkl_buf_convert_input);
+ CHECK_EQ(dnnConversionExecute_F32(mkl_prim_convert_input, mkl_buf_input,
+ mkl_buf_convert_input),
+ E_SUCCESS);
+ dnnDelete_F32(mkl_prim_convert_input);
+ }
+ dnnLayoutDelete_F32(mkl_lt_internal_input);
+
+ conv_res[dnnResourceSrc] =
+ (mkl_convert_input) ? mkl_buf_convert_input : mkl_buf_input;
+
+ const Tensor& out_backprop = MklGetInput(context, 2);
+ void* mkl_buf_out_backprop = const_cast<void*>(
+ static_cast<const void*>(out_backprop.flat<T>().data()));
+ CHECK_EQ(dnnLayoutCreateFromPrimitive_F32(&mkl_lt_internal_out_backprop,
+ prim_conv_bwdfilter,
+ dnnResourceDiffDst),
+ E_SUCCESS);
+ mkl_convert_out_backprop =
+ !dnnLayoutCompare_F32(mkl_lt_internal_out_backprop, lt_out_backprop);
+ if (mkl_convert_out_backprop) {
+ CHECK_EQ(dnnConversionCreate_F32(&mkl_prim_convert_out_backprop,
+ lt_out_backprop,
+ mkl_lt_internal_out_backprop),
+ E_SUCCESS);
+        AllocTmpBuffer(context, mkl_tmp_out_backprop_buf_tensor,
+                       mkl_lt_internal_out_backprop,
+                       &mkl_buf_convert_out_backprop);
+ CHECK_EQ(dnnConversionExecute_F32(mkl_prim_convert_out_backprop,
+ mkl_buf_out_backprop,
+ mkl_buf_convert_out_backprop),
+ E_SUCCESS);
+ dnnDelete_F32(mkl_prim_convert_out_backprop);
+ }
+ dnnLayoutDelete_F32(mkl_lt_internal_out_backprop);
+
+ conv_res[dnnResourceDiffDst] = (mkl_convert_out_backprop)
+ ? mkl_buf_convert_out_backprop
+ : mkl_buf_out_backprop;
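+      // At this point conv_res[] holds dnnResourceSrc and dnnResourceDiffDst;
+      // dnnResourceDiffFilter is filled in by MklPrepareGradFilter().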
+ }
+
+ void MklPrepareGradFilter(OpKernelContext* context, Tensor* grad_filter,
+ Tensor* mkl_tmp_grad_filter_buf_tensor) {
+ bool mkl_convert_grad_filter;
+ dnnLayout_t mkl_lt_internal_grad_filter = nullptr;
+ void* mkl_buf_convert_grad_filter = nullptr;
+ void* mkl_buf_grad_filter = const_cast<void*>(
+ static_cast<const void*>(grad_filter->flat<T>().data()));
+ CHECK_EQ(dnnLayoutCreateFromPrimitive_F32(&mkl_lt_internal_grad_filter,
+ prim_conv_bwdfilter,
+ dnnResourceDiffFilter),
+ E_SUCCESS);
+ mkl_convert_grad_filter =
+ !dnnLayoutCompare_F32(mkl_lt_internal_grad_filter, lt_grad_filter);
+ if (mkl_convert_grad_filter) {
+ CHECK_EQ(dnnConversionCreate_F32(&convert_bwdfilter,
+ mkl_lt_internal_grad_filter,
+ lt_grad_filter),
+ E_SUCCESS);
+ AllocTmpBuffer(context, mkl_tmp_grad_filter_buf_tensor,
+ mkl_lt_internal_grad_filter,
+ &mkl_buf_convert_grad_filter);
+ }
+ dnnLayoutDelete_F32(mkl_lt_internal_grad_filter);
+
+ conv_res[dnnResourceDiffFilter] = (mkl_convert_grad_filter)
+ ? mkl_buf_convert_grad_filter
+ : mkl_buf_grad_filter;
+ }
+ } MklConv2DGradFilterOpContext;
+
+ std::vector<int32> strides_;
+ Padding padding_;
+ TensorFormat data_format_;
+};
+
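+// Register the MKL filter-gradient kernel for float only; the MKL layer label
+// limits it to nodes tagged by the MKL graph-rewrite pass.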
+#define REGISTER_MKL_FILTER_KERNELS(T) \
+ REGISTER_KERNEL_BUILDER(Name("MklConv2DBackpropFilter") \
+ .Device(DEVICE_CPU) \
+ .TypeConstraint<T>("T") \
+ .Label(mkl_layer_registry::kMklLayerLabel), \
+ MklConv2DCustomBackpropFilterOp<CPUDevice, T>);
+
+TF_CALL_float(REGISTER_MKL_FILTER_KERNELS);
+#undef REGISTER_MKL_FILTER_KERNELS
+} // namespace tensorflow
+
+#endif // INTEL_MKL
diff --git a/tensorflow/core/kernels/mkl_conv_grad_input_ops.cc b/tensorflow/core/kernels/mkl_conv_grad_input_ops.cc
new file mode 100644
index 0000000000..c7d95c86bc
--- /dev/null
+++ b/tensorflow/core/kernels/mkl_conv_grad_input_ops.cc
@@ -0,0 +1,355 @@
+/* Copyright 2015 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+// See docs in ../ops/nn_ops.cc. This OpKernel uses the MKL library: it creates
+// MKL layouts and primitives and uses MKL DNN primitives to compute the
+// convolution backward-input (input gradient).
+
+#ifdef INTEL_MKL
+
+#define USE_EIGEN_TENSOR
+#define EIGEN_USE_THREADS
+#include <algorithm>
+#include <vector>
+#include "third_party/mkl/include/mkl_dnn.h"
+#include "third_party/mkl/include/mkl_dnn_types.h"
+#include "tensorflow/core/framework/numeric_op.h"
+#include "tensorflow/core/framework/op_kernel.h"
+#include "tensorflow/core/framework/register_types.h"
+#include "tensorflow/core/framework/tensor.h"
+#include "tensorflow/core/framework/tensor_shape.h"
+#include "tensorflow/core/framework/tensor_slice.h"
+#include "tensorflow/core/kernels/conv_grad_ops.h"
+#include "tensorflow/core/kernels/ops_util.h"
+#include "tensorflow/core/lib/core/errors.h"
+#include "tensorflow/core/lib/gtl/array_slice.h"
+#include "tensorflow/core/platform/logging.h"
+#include "tensorflow/core/platform/macros.h"
+#include "tensorflow/core/util/mkl_util.h"
+#include "tensorflow/core/util/padding.h"
+#include "tensorflow/core/util/tensor_format.h"
+#include "tensorflow/core/util/use_cudnn.h"
+#include "tensorflow/core/util/work_sharder.h"
+
+namespace tensorflow {
+
+typedef Eigen::ThreadPoolDevice CPUDevice;
+
+template <typename Device, class T>
+class MklConv2DCustomBackpropInputOp : public OpKernel {
+ public:
+ ~MklConv2DCustomBackpropInputOp() {}
+ explicit MklConv2DCustomBackpropInputOp(OpKernelConstruction* context)
+ : OpKernel(context) {
+ string dataformat;
+ OP_REQUIRES_OK(context, context->GetAttr("data_format", &dataformat));
+ OP_REQUIRES(context, FormatFromString(dataformat, &data_format),
+ errors::InvalidArgument("Invalid data format"));
+ OP_REQUIRES_OK(context, context->GetAttr("strides", &strides));
+ int stride_n = GetTensorDim(strides, data_format, 'N');
+ int stride_c = GetTensorDim(strides, data_format, 'C');
+ OP_REQUIRES(
+ context, (stride_n == 1 && stride_c == 1),
+ errors::InvalidArgument("Current implementation does not yet support "
+ "strides in the batch and depth dimensions."));
+
+ OP_REQUIRES_OK(context, context->GetAttr("padding", &padding));
+ }
+
+ void Compute(OpKernelContext* context) override {
+ MklConvBackInputOpContext mkl_context;
+ const Tensor& input = MklGetInput(context, 0);
+ const Tensor& filter = MklGetInput(context, 1);
+
+ GetMklShape(context, 1, &(mkl_context.filter_shape));
+ bool filter_in_mkl_format = mkl_context.filter_shape.IsMklTensor();
+
+ const Tensor& out_backprop = MklGetInput(context, 2);
+ GetMklShape(context, 2, &(mkl_context.outback_shape));
+ bool outback_in_mkl_format = mkl_context.outback_shape.IsMklTensor();
+
+ TensorShape input_shape, filter_shape, outback_shape;
+
+ // Generate input shape.
+ OP_REQUIRES(
+ context, TensorShapeUtils::IsVector(input.shape()),
+ errors::InvalidArgument(
+ "Conv2DBackpropInput: input_sizes input must be 1-dim, not ",
+ input.dims()));
+ OP_REQUIRES_OK(
+ context, TensorShapeUtils::MakeShape(input.vec<int32>(), &input_shape));
+
+    // Generate the filter shape (convert from MKL sizes if the filter is in
+    // MKL format).
+ if (filter_in_mkl_format) {
+ OP_REQUIRES(context, mkl_context.filter_shape.GetDimension() == 4,
+ errors::InvalidArgument(
+ "Conv2DCustomBackpropInput: size must be 4-dim"));
+
+ MklSizesToTFSizes(context, data_format, mkl_context.filter_shape,
+ &filter_shape);
+ } else {
+ filter_shape = filter.shape();
+ }
+
+    // Generate the out_backprop shape (convert from MKL sizes if it is in
+    // MKL format).
+ if (outback_in_mkl_format) {
+ OP_REQUIRES(context, mkl_context.outback_shape.GetDimension() == 4,
+ errors::InvalidArgument(
+ "Conv2DCustomBackpropInput: size must be 4-dim"));
+
+ MklSizesToTFSizes(context, data_format, mkl_context.outback_shape,
+ &outback_shape);
+ } else {
+ outback_shape = out_backprop.shape();
+ }
+
+ ConvBackpropDimensions dims;
+ OP_REQUIRES_OK(
+ context,
+ ConvBackpropComputeDimensions(
+ "Conv2DCustomBackpropInput", /*num_spatial_dims=*/2, input_shape,
+ filter_shape, outback_shape, strides, padding, data_format, &dims));
+
+ int64 pad_top, pad_bottom;
+ int64 pad_left, pad_right;
+ OP_REQUIRES_OK(
+ context,
+ GetWindowedOutputSizeVerbose(
+ dims.spatial_dims[0].input_size, dims.spatial_dims[0].filter_size,
+ dims.spatial_dims[0].stride, padding,
+ &dims.spatial_dims[0].output_size, &pad_top, &pad_bottom));
+ OP_REQUIRES_OK(
+ context,
+ GetWindowedOutputSizeVerbose(
+ dims.spatial_dims[1].input_size, dims.spatial_dims[1].filter_size,
+ dims.spatial_dims[1].stride, padding,
+ &dims.spatial_dims[1].output_size, &pad_left, &pad_right));
+
+ mkl_context.in_dims = 4;
+
+ mkl_context.in_sizes[0] =
+ static_cast<size_t>(dims.spatial_dims[1].input_size);
+ mkl_context.in_sizes[1] =
+ static_cast<size_t>(dims.spatial_dims[0].input_size);
+ mkl_context.in_sizes[2] = static_cast<size_t>(dims.in_depth);
+ mkl_context.in_sizes[3] = static_cast<size_t>(dims.batch_size);
+
+ mkl_context.out_sizes[0] =
+ static_cast<size_t>(dims.spatial_dims[1].output_size);
+ mkl_context.out_sizes[1] =
+ static_cast<size_t>(dims.spatial_dims[0].output_size);
+ mkl_context.out_sizes[2] = static_cast<size_t>(dims.out_depth);
+ mkl_context.out_sizes[3] = static_cast<size_t>(dims.batch_size);
+
+ mkl_context.input_offset[0] = static_cast<int>(-pad_left);
+ mkl_context.input_offset[1] = static_cast<int>(-pad_top);
+
+ mkl_context.conv_strides[0] =
+ static_cast<size_t>(dims.spatial_dims[1].stride);
+ mkl_context.conv_strides[1] =
+ static_cast<size_t>(dims.spatial_dims[0].stride);
+
+ GetStridesFromSizes(data_format, mkl_context.out_strides,
+ mkl_context.out_sizes);
+ GetStridesFromSizes(data_format, mkl_context.in_strides,
+ mkl_context.in_sizes);
+
+ mkl_context.filter_size[0] = dims.spatial_dims[1].filter_size;
+ mkl_context.filter_size[1] = dims.spatial_dims[0].filter_size;
+ mkl_context.filter_size[2] = dims.in_depth;
+ mkl_context.filter_size[3] = dims.out_depth;
+
+ mkl_context.filter_stride[0] =
+ mkl_context.filter_size[2] * mkl_context.filter_size[3];
+ mkl_context.filter_stride[1] = mkl_context.filter_size[2] *
+ mkl_context.filter_size[0] *
+ mkl_context.filter_size[3];
+ mkl_context.filter_stride[2] = mkl_context.filter_size[3];
+ mkl_context.filter_stride[3] = 1;
+
+ CHECK_EQ(
+ dnnConvolutionCreateBackwardData_F32(
+            &mkl_context.prim_bwddata, nullptr, dnnAlgorithmConvolutionDirect,
+ mkl_context.in_dims, mkl_context.in_sizes, mkl_context.out_sizes,
+ mkl_context.filter_size, mkl_context.conv_strides,
+ mkl_context.input_offset, dnnBorderZeros),
+ E_SUCCESS);
+
+ // Allocate output tensor and shape
+ TensorShape mkl_out_shape;
+ MklShape mklOutputShape;
+ mklOutputShape.SetMklTensor(true);
+ mklOutputShape.SetMklLayout(mkl_context.prim_bwddata, dnnResourceDiffSrc);
+ mklOutputShape.SetTfLayout(mkl_context.in_dims, mkl_context.in_sizes,
+ mkl_context.in_strides);
+ // MKL might change the dimension ordering.
+ // Create mapping to recover the original TF dimension order
+ mklOutputShape.SetTfDimOrder(mkl_context.in_dims, data_format);
+
+ Tensor* in_backprop = nullptr;
+ mkl_out_shape.AddDim(dnnLayoutGetMemorySize_F32(static_cast<dnnLayout_t>(
+ mklOutputShape.GetMklLayout())) /
+ sizeof(T));
+ AllocateOutputSetMklshape(context, 0, &in_backprop, mkl_out_shape,
+ mklOutputShape);
+
+ mkl_context.conv_res[dnnResourceDiffSrc] =
+ static_cast<void*>(const_cast<T*>(in_backprop->flat<T>().data()));
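+    // The backward-data primitive writes dnnResourceDiffSrc straight into the
+    // output buffer allocated above; the TF-side shape is just a flat buffer
+    // sized to the MKL layout, with dimensions carried by the attached
+    // MklShape.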
+
+ mkl_context.MklCreateInputLayouts(context);
+ Tensor mkl_tmp_outbackprop_buf_tensor, mkl_tmp_filter_buf_tensor;
+ mkl_context.MklPrepareConvolutionInputs(
+ context, &mkl_tmp_outbackprop_buf_tensor, &mkl_tmp_filter_buf_tensor);
+
+ CHECK_EQ(dnnExecute_F32(mkl_context.prim_bwddata, mkl_context.conv_res),
+ E_SUCCESS);
+ mkl_context.MklCleanup();
+ }
+
+ private:
+ typedef struct {
+ int in_dims;
+ size_t in_sizes[4];
+ size_t in_strides[4];
+ size_t out_sizes[4];
+ size_t out_strides[4];
+ int input_offset[2];
+ size_t filter_size[4];
+ size_t filter_stride[4];
+ size_t conv_strides[2];
+ MklShape filter_shape, outback_shape;
+ dnnPrimitive_t prim_bwddata;
+ void* conv_res[dnnResourceNumber];
+ dnnLayout_t lt_filter, lt_outbackprop;
+
+ // Create MKL dnnLayout_t objects for tensors coming into the layer
+ void MklCreateInputLayouts(OpKernelContext* context) {
+ bool filter_in_mkl_format = filter_shape.IsMklTensor();
+ bool outback_in_mkl_format = outback_shape.IsMklTensor();
+ if (filter_in_mkl_format) {
+ lt_filter = (dnnLayout_t)filter_shape.GetCurLayout();
+ } else {
+ CHECK_EQ(dnnLayoutCreate_F32(&lt_filter, in_dims, filter_size,
+ filter_stride),
+ E_SUCCESS);
+ }
+
+ if (outback_in_mkl_format) {
+ lt_outbackprop = (dnnLayout_t)outback_shape.GetCurLayout();
+ } else {
+ CHECK_EQ(dnnLayoutCreate_F32(&lt_outbackprop, in_dims, out_sizes,
+ out_strides),
+ E_SUCCESS);
+ }
+ }
+
+ // Compare incoming input tensor layouts with MKL preferred layouts and
+ // convert data to the preferred layout if necessary
+ void MklPrepareConvolutionInputs(OpKernelContext* context,
+ Tensor* mkl_tmp_outbackprop_buf_tensor,
+ Tensor* mkl_tmp_filter_buf_tensor) {
+ dnnPrimitive_t mkl_convert_filter = nullptr,
+ mkl_convert_outbackprop = nullptr;
+ void *mkl_filter_buf = nullptr, *mkl_outbackprop_buf = nullptr;
+ dnnLayout_t mkl_lt_filter_internal = nullptr,
+ mkl_lt_outbackprop_internal = nullptr;
+ CHECK_EQ(dnnLayoutCreateFromPrimitive_F32(
+ &mkl_lt_filter_internal, prim_bwddata, dnnResourceFilter),
+ E_SUCCESS);
+
+ const Tensor& filter = MklGetInput(context, 1);
+
+ CHECK_EQ(
+ dnnLayoutCreateFromPrimitive_F32(&mkl_lt_outbackprop_internal,
+ prim_bwddata, dnnResourceDiffDst),
+ E_SUCCESS);
+ if (!dnnLayoutCompare_F32(mkl_lt_filter_internal, lt_filter)) {
+ // Create conversion primitive
+ CHECK_EQ(dnnConversionCreate_F32(&mkl_convert_filter, lt_filter,
+ mkl_lt_filter_internal),
+ E_SUCCESS);
+
+ AllocTmpBuffer(context, mkl_tmp_filter_buf_tensor,
+ mkl_lt_filter_internal, &mkl_filter_buf);
+ CHECK_EQ(
+ dnnConversionExecute_F32(
+ mkl_convert_filter,
+ static_cast<void*>(const_cast<T*>(filter.flat<T>().data())),
+ mkl_filter_buf),
+ E_SUCCESS);
+
+ // Assign filter buf to resources[] for convolution.
+ conv_res[dnnResourceFilter] = mkl_filter_buf;
+ dnnDelete_F32(mkl_convert_filter);
+ } else {
+        // If no layout conversion is needed for the filter, directly
+        // assign the input filter to resources[].
+ conv_res[dnnResourceFilter] =
+ static_cast<void*>(const_cast<T*>(filter.flat<T>().data()));
+ }
+ dnnLayoutDelete_F32(mkl_lt_filter_internal);
+ const Tensor& out_backprop = MklGetInput(context, 2);
+      // Perform the same layout-conversion steps as above for out_backprop.
+ if (!dnnLayoutCompare_F32(mkl_lt_outbackprop_internal, lt_outbackprop)) {
+ CHECK_EQ(
+ dnnConversionCreate_F32(&mkl_convert_outbackprop, lt_outbackprop,
+ mkl_lt_outbackprop_internal),
+ E_SUCCESS);
+ AllocTmpBuffer(context, mkl_tmp_outbackprop_buf_tensor,
+ mkl_lt_outbackprop_internal, &mkl_outbackprop_buf);
+
+ CHECK_EQ(dnnConversionExecute_F32(mkl_convert_outbackprop,
+ static_cast<void*>(const_cast<T*>(
+ out_backprop.flat<T>().data())),
+ mkl_outbackprop_buf),
+ E_SUCCESS);
+
+ conv_res[dnnResourceDiffDst] = mkl_outbackprop_buf;
+ dnnDelete_F32(mkl_convert_outbackprop);
+ } else {
+ conv_res[dnnResourceDiffDst] =
+ static_cast<void*>(const_cast<T*>(out_backprop.flat<T>().data()));
+ }
+ dnnLayoutDelete_F32(mkl_lt_outbackprop_internal);
+ }
+
+ // Cleanup member layouts and primitives
+ void MklCleanup() {
+ bool filter_in_mkl_format = filter_shape.IsMklTensor();
+ bool outback_in_mkl_format = outback_shape.IsMklTensor();
+ if (!filter_in_mkl_format) dnnLayoutDelete_F32(lt_filter);
+ if (!outback_in_mkl_format) dnnLayoutDelete_F32(lt_outbackprop);
+ dnnDelete_F32(prim_bwddata);
+ }
+ } MklConvBackInputOpContext;
+
+ std::vector<int32> strides;
+ Padding padding;
+ TensorFormat data_format;
+};
+
+#define REGISTER_MKL_CPU_KERNELS(T) \
+ REGISTER_KERNEL_BUILDER(Name("MklConv2DBackpropInput") \
+ .Device(DEVICE_CPU) \
+ .TypeConstraint<T>("T") \
+ .Label(mkl_layer_registry::kMklLayerLabel), \
+ MklConv2DCustomBackpropInputOp<CPUDevice, T>);
+
+TF_CALL_float(REGISTER_MKL_CPU_KERNELS);
+#undef REGISTER_MKL_CPU_KERNELS
+
+} // namespace tensorflow
+#endif // INTEL_MKL
diff --git a/tensorflow/core/kernels/mkl_conv_ops.cc b/tensorflow/core/kernels/mkl_conv_ops.cc
index 5a9a82d2e9..e5c4c21a10 100644
--- a/tensorflow/core/kernels/mkl_conv_ops.cc
+++ b/tensorflow/core/kernels/mkl_conv_ops.cc
@@ -43,7 +43,6 @@ limitations under the License.
namespace tensorflow {
typedef Eigen::ThreadPoolDevice CPUDevice;
-typedef Eigen::GpuDevice GPUDevice;
template <typename Device, typename T, bool biasEnabled>
class MklConv2DOp : public OpKernel {
@@ -70,9 +69,10 @@ class MklConv2DOp : public OpKernel {
}
void Compute(OpKernelContext* context) override {
+ MklConv2DOpContext mkl_context;
const Tensor& input = MklGetInput(context, 0);
- GetMklShape(context, 0, &(mkl_params_.input_shape));
- bool input_in_mkl_format = mkl_params_.input_shape.IsMklTensor();
+ GetMklShape(context, 0, &(mkl_context.input_shape));
+ bool input_in_mkl_format = mkl_context.input_shape.IsMklTensor();
const Tensor& filter = MklGetInput(context, 1);
MklShape mkl_filter_shape;
@@ -104,9 +104,9 @@ class MklConv2DOp : public OpKernel {
errors::InvalidArgument("filter too large"));
}
- const int64 input_depth = input_in_mkl_format
- ? mkl_params_.input_shape.GetSizes()[2]
- : GetTensorDim(input, data_format_, 'C');
+ const int64 input_depth =
+ input_in_mkl_format ? GetMklTensorDim(mkl_context.input_shape, 'C')
+ : GetTensorDim(input, data_format_, 'C');
OP_REQUIRES(context, input_depth == filter.dim_size(2),
errors::InvalidArgument(
"input and filter must have the same depth: ", input_depth,
@@ -116,9 +116,9 @@ class MklConv2DOp : public OpKernel {
// The second dimension for input is rows/height.
// The first dimension for filter is rows/height.
- const int64 input_rows_raw = input_in_mkl_format
- ? mkl_params_.input_shape.GetSizes()[1]
- : GetTensorDim(input, data_format_, 'H');
+ const int64 input_rows_raw =
+ input_in_mkl_format ? GetMklTensorDim(mkl_context.input_shape, 'H')
+ : GetTensorDim(input, data_format_, 'H');
OP_REQUIRES(
context,
FastBoundsCheck(input_rows_raw, std::numeric_limits<int>::max()),
@@ -128,9 +128,9 @@ class MklConv2DOp : public OpKernel {
// The third dimension for input is columns/width.
// The second dimension for filter is columns/width.
- const int64 input_cols_raw = input_in_mkl_format
- ? mkl_params_.input_shape.GetSizes()[0]
- : GetTensorDim(input, data_format_, 'W');
+ const int64 input_cols_raw =
+ input_in_mkl_format ? GetMklTensorDim(mkl_context.input_shape, 'W')
+ : GetTensorDim(input, data_format_, 'W');
OP_REQUIRES(
context,
FastBoundsCheck(input_cols_raw, std::numeric_limits<int>::max()),
@@ -139,9 +139,9 @@ class MklConv2DOp : public OpKernel {
const int filter_cols = static_cast<int>(filter.dim_size(1));
// The first dimension for input is batch.
- const int64 input_batch_raw = input_in_mkl_format
- ? mkl_params_.input_shape.GetSizes()[3]
- : GetTensorDim(input, data_format_, 'N');
+ const int64 input_batch_raw =
+ input_in_mkl_format ? GetMklTensorDim(mkl_context.input_shape, 'N')
+ : GetTensorDim(input, data_format_, 'N');
OP_REQUIRES(
context,
FastBoundsCheck(input_batch_raw, std::numeric_limits<int>::max()),
@@ -184,98 +184,105 @@ class MklConv2DOp : public OpKernel {
}
// Create MKL convolution primitives
- mkl_params_.in_dims = input_in_mkl_format
- ? mkl_params_.input_shape.GetDimension()
+ mkl_context.in_dims = input_in_mkl_format
+ ? mkl_context.input_shape.GetDimension()
: input.dims();
- mkl_params_.filter_dims = filter.dims();
- mkl_params_.in_sizes[0] = static_cast<size_t>(input_cols);
- mkl_params_.in_sizes[1] = static_cast<size_t>(input_rows);
- mkl_params_.in_sizes[2] = static_cast<size_t>(input_depth);
- mkl_params_.in_sizes[3] = static_cast<size_t>(batch);
- mkl_params_.out_sizes[0] = static_cast<size_t>(out_cols);
- mkl_params_.out_sizes[1] = static_cast<size_t>(out_rows);
- mkl_params_.out_sizes[2] = static_cast<size_t>(out_depth);
- mkl_params_.out_sizes[3] = static_cast<size_t>(batch);
- mkl_params_.input_offset[0] = static_cast<int>(-pad_cols);
- mkl_params_.input_offset[1] = static_cast<int>(-pad_rows);
- mkl_params_.conv_stride[0] = static_cast<size_t>(stride_cols);
- mkl_params_.conv_stride[1] = static_cast<size_t>(stride_rows);
-
- GetStridesFromSizes(data_format_, mkl_params_.out_strides,
- mkl_params_.out_sizes);
- GetStridesFromSizes(data_format_, mkl_params_.in_strides,
- mkl_params_.in_sizes);
+ mkl_context.filter_dims = filter.dims();
+
+ mkl_context.in_sizes[MklDims::W] = static_cast<size_t>(input_cols);
+ mkl_context.in_sizes[MklDims::H] = static_cast<size_t>(input_rows);
+ mkl_context.in_sizes[MklDims::C] = static_cast<size_t>(input_depth);
+ mkl_context.in_sizes[MklDims::N] = static_cast<size_t>(batch);
+
+ mkl_context.out_sizes[MklDims::W] = static_cast<size_t>(out_cols);
+ mkl_context.out_sizes[MklDims::H] = static_cast<size_t>(out_rows);
+ mkl_context.out_sizes[MklDims::C] = static_cast<size_t>(out_depth);
+ mkl_context.out_sizes[MklDims::N] = static_cast<size_t>(batch);
+
+ mkl_context.input_offset[0] = static_cast<int>(-pad_cols);
+ mkl_context.input_offset[1] = static_cast<int>(-pad_rows);
+
+ mkl_context.conv_stride[0] = static_cast<size_t>(stride_cols);
+ mkl_context.conv_stride[1] = static_cast<size_t>(stride_rows);
+
+ GetStridesFromSizes(data_format_, mkl_context.out_strides,
+ mkl_context.out_sizes);
+ GetStridesFromSizes(data_format_, mkl_context.in_strides,
+ mkl_context.in_sizes);
// TF filter dimension order (out_depth, in_depth, cols, rows) ->
// MKL filter dimension order (out_depth, in_depth, rows, cols)
- mkl_params_.filter_sizes[0] = filter.dim_size(1); // cols
- mkl_params_.filter_sizes[1] = filter.dim_size(0); // rows
- mkl_params_.filter_sizes[2] = filter.dim_size(2); // in_depth
- mkl_params_.filter_sizes[3] = filter.dim_size(3); // out_depth
+ mkl_context.filter_sizes[0] = filter.dim_size(1); // cols
+ mkl_context.filter_sizes[1] = filter.dim_size(0); // rows
+ mkl_context.filter_sizes[2] = filter.dim_size(2); // in_depth
+ mkl_context.filter_sizes[3] = filter.dim_size(3); // out_depth
// TF filter layout - (rows, cols, in_depth, out_depth)
- mkl_params_.filter_strides[0] =
+ mkl_context.filter_strides[0] =
filter.dim_size(2) * filter.dim_size(3); // cols
- mkl_params_.filter_strides[1] =
+ mkl_context.filter_strides[1] =
filter.dim_size(1) * filter.dim_size(2) * filter.dim_size(3); // rows
- mkl_params_.filter_strides[2] = filter.dim_size(3); // in_depth
- mkl_params_.filter_strides[3] = 1; // out_depth
+ mkl_context.filter_strides[2] = filter.dim_size(3); // in_depth
+ mkl_context.filter_strides[3] = 1; // out_depth
if (biasEnabled) {
const Tensor& bias = MklGetInput(context, 2);
- mkl_params_.bias_sizes[0] = {static_cast<size_t>(bias.dim_size(0))};
- mkl_params_.bias_strides[0] = {1};
+ mkl_context.bias_sizes[0] = {static_cast<size_t>(bias.dim_size(0))};
+ mkl_context.bias_strides[0] = {1};
}
// Create Convolution Primitive
if (biasEnabled) {
- CHECK_EQ(dnnConvolutionCreateForwardBias_F32(
- &mkl_prim_convolution_fwd_, nullptr,
- dnnAlgorithmConvolutionDirect, mkl_params_.in_dims,
- mkl_params_.in_sizes, mkl_params_.out_sizes,
- mkl_params_.filter_sizes, mkl_params_.conv_stride,
- mkl_params_.input_offset, dnnBorderZeros),
- E_SUCCESS);
+ CHECK_EQ(
+ dnnConvolutionCreateForwardBias_F32(
+ &mkl_context.prim_fwd, nullptr, dnnAlgorithmConvolutionDirect,
+ mkl_context.in_dims, mkl_context.in_sizes, mkl_context.out_sizes,
+ mkl_context.filter_sizes, mkl_context.conv_stride,
+ mkl_context.input_offset, dnnBorderZeros),
+ E_SUCCESS);
} else {
- CHECK_EQ(dnnConvolutionCreateForward_F32(
- &mkl_prim_convolution_fwd_, nullptr,
- dnnAlgorithmConvolutionDirect, mkl_params_.in_dims,
- mkl_params_.in_sizes, mkl_params_.out_sizes,
- mkl_params_.filter_sizes, mkl_params_.conv_stride,
- mkl_params_.input_offset, dnnBorderZeros),
- E_SUCCESS);
+ CHECK_EQ(
+ dnnConvolutionCreateForward_F32(
+ &mkl_context.prim_fwd, nullptr, dnnAlgorithmConvolutionDirect,
+ mkl_context.in_dims, mkl_context.in_sizes, mkl_context.out_sizes,
+ mkl_context.filter_sizes, mkl_context.conv_stride,
+ mkl_context.input_offset, dnnBorderZeros),
+ E_SUCCESS);
}
TensorShape mkl_output_tf_shape;
MklShape mkl_output_mkl_shape;
mkl_output_mkl_shape.SetMklTensor(true);
- mkl_output_mkl_shape.SetMklLayout(mkl_prim_convolution_fwd_,
- dnnResourceDst);
- mkl_output_mkl_shape.SetTfLayout(mkl_params_.in_dims, mkl_params_.out_sizes,
- mkl_params_.out_strides);
+ mkl_output_mkl_shape.SetMklLayout(mkl_context.prim_fwd, dnnResourceDst);
+ mkl_output_mkl_shape.SetTfLayout(mkl_context.in_dims, mkl_context.out_sizes,
+ mkl_context.out_strides);
+ // MKL might change the dimension ordering
+ // Create mapping to recover the original TF dimension order
+ mkl_output_mkl_shape.SetTfDimOrder(mkl_context.in_dims, data_format_);
+
mkl_output_tf_shape.AddDim(
dnnLayoutGetMemorySize_F32(
static_cast<dnnLayout_t>(mkl_output_mkl_shape.GetMklLayout())) /
sizeof(T));
AllocateOutputSetMklshape(context, 0, &output, mkl_output_tf_shape,
mkl_output_mkl_shape);
- mkl_conv_res_[dnnResourceDst] =
+ mkl_context.conv_res[dnnResourceDst] =
static_cast<void*>(output->flat<T>().data());
- MklCreateInputLayouts(context);
+ mkl_context.MklCreateInputLayouts(context);
Tensor mkl_tmp_input_buf_tensor, mkl_tmp_filter_buf_tensor,
mkl_tmp_bias_buf_tensor; // Temp tensor used to allocate tmp
// buffers
- MklPrepareConvolutionInputs(context, &mkl_tmp_input_buf_tensor,
- &mkl_tmp_filter_buf_tensor,
- &mkl_tmp_bias_buf_tensor);
+ mkl_context.MklPrepareConvolutionInputs(context, &mkl_tmp_input_buf_tensor,
+ &mkl_tmp_filter_buf_tensor,
+ &mkl_tmp_bias_buf_tensor);
// Execute convolution
- CHECK_EQ(dnnExecute_F32(mkl_prim_convolution_fwd_, mkl_conv_res_),
+ CHECK_EQ(dnnExecute_F32(mkl_context.prim_fwd, mkl_context.conv_res),
E_SUCCESS);
- MklCleanup();
+ mkl_context.MklCleanup();
}
private:
@@ -293,151 +300,141 @@ class MklConv2DOp : public OpKernel {
int input_offset[2];
size_t conv_stride[2];
MklShape input_shape;
- } MklConv2DOpParams;
-
- // Create MKL dnnLayout_t objects for tensors coming into the layer
- void MklCreateInputLayouts(OpKernelContext* context) {
- bool input_in_mkl_format = mkl_params_.input_shape.IsMklTensor();
- if (input_in_mkl_format) {
- mkl_lt_input_ =
- static_cast<dnnLayout_t>(mkl_params_.input_shape.GetCurLayout());
- } else {
- CHECK_EQ(
- dnnLayoutCreate_F32(&mkl_lt_input_, mkl_params_.in_dims,
- mkl_params_.in_sizes, mkl_params_.in_strides),
- E_SUCCESS);
- }
-
- CHECK_EQ(dnnLayoutCreate_F32(&mkl_lt_filter_, mkl_params_.filter_dims,
- mkl_params_.filter_sizes,
- mkl_params_.filter_strides),
- E_SUCCESS);
+ dnnPrimitive_t prim_fwd;
+ void* conv_res[dnnResourceNumber];
+ dnnLayout_t lt_filter, lt_bias, lt_input;
+
+ // Create MKL dnnLayout_t objects for tensors coming into the layer
+ void MklCreateInputLayouts(OpKernelContext* context) {
+ bool input_in_mkl_format = input_shape.IsMklTensor();
+ if (input_in_mkl_format) {
+ lt_input = static_cast<dnnLayout_t>(input_shape.GetCurLayout());
+ } else {
+ CHECK_EQ(dnnLayoutCreate_F32(&lt_input, in_dims, in_sizes, in_strides),
+ E_SUCCESS);
+ }
- if (biasEnabled) {
- CHECK_EQ(dnnLayoutCreate_F32(&mkl_lt_bias_, 1, mkl_params_.bias_sizes,
- mkl_params_.bias_strides),
+ CHECK_EQ(dnnLayoutCreate_F32(&lt_filter, filter_dims, filter_sizes,
+ filter_strides),
E_SUCCESS);
- }
- }
- // Compare incoming tensor layouts with MKL preferred layouts and convert
- // data to the preferred layout if necessary
- void MklPrepareConvolutionInputs(OpKernelContext* context,
- Tensor* mkl_tmp_input_buf_tensor,
- Tensor* mkl_tmp_filter_buf_tensor,
- Tensor* mkl_tmp_bias_buf_tensor) {
- bool mkl_convert_input, mkl_convert_filter, mkl_convert_bias;
- dnnPrimitive_t mkl_prim_convert_filter, mkl_prim_convert_bias,
- mkl_prim_convert_input;
- dnnLayout_t mkl_lt_internal_filter, mkl_lt_internal_bias,
- mkl_lt_internal_input;
- void *mkl_buf_convert_input, *mkl_buf_convert_filter, *mkl_buf_convert_bias;
- mkl_prim_convert_filter = nullptr;
- mkl_prim_convert_bias = nullptr;
- mkl_prim_convert_input = nullptr;
- mkl_lt_internal_filter = nullptr;
- mkl_lt_internal_bias = nullptr;
- mkl_lt_internal_input = nullptr;
- mkl_buf_convert_input = nullptr;
- mkl_buf_convert_filter = nullptr;
- mkl_buf_convert_bias = nullptr;
-
- // Compare with internal layouts and convert if needed
- const Tensor& input = MklGetInput(context, 0);
- void* mkl_buf_input =
- const_cast<void*>(static_cast<const void*>(input.flat<T>().data()));
- CHECK_EQ(
- dnnLayoutCreateFromPrimitive_F32(
- &mkl_lt_internal_input, mkl_prim_convolution_fwd_, dnnResourceSrc),
- E_SUCCESS);
- mkl_convert_input =
- !dnnLayoutCompare_F32(mkl_lt_internal_input, mkl_lt_input_);
- if (mkl_convert_input) {
- CHECK_EQ(dnnConversionCreate_F32(&mkl_prim_convert_input, mkl_lt_input_,
- mkl_lt_internal_input),
- E_SUCCESS);
- AllocTmpBuffer(context, mkl_tmp_input_buf_tensor, mkl_lt_internal_input,
- &mkl_buf_convert_input);
- CHECK_EQ(dnnConversionExecute_F32(mkl_prim_convert_input, mkl_buf_input,
- mkl_buf_convert_input),
- E_SUCCESS);
- dnnDelete_F32(mkl_prim_convert_input);
+ if (biasEnabled) {
+ CHECK_EQ(dnnLayoutCreate_F32(&lt_bias, 1, bias_sizes, bias_strides),
+ E_SUCCESS);
+ }
}
- dnnLayoutDelete_F32(mkl_lt_internal_input);
-
- mkl_conv_res_[dnnResourceSrc] =
- (mkl_convert_input) ? mkl_buf_convert_input : mkl_buf_input;
- const Tensor& filter = MklGetInput(context, 1);
- void* mkl_buf_filter =
- const_cast<void*>(static_cast<const void*>(filter.flat<T>().data()));
- CHECK_EQ(dnnLayoutCreateFromPrimitive_F32(&mkl_lt_internal_filter,
- mkl_prim_convolution_fwd_,
- dnnResourceFilter),
- E_SUCCESS);
- mkl_convert_filter =
- !dnnLayoutCompare_F32(mkl_lt_internal_filter, mkl_lt_filter_);
- if (mkl_convert_filter) {
- CHECK_EQ(dnnConversionCreate_F32(&mkl_prim_convert_filter, mkl_lt_filter_,
- mkl_lt_internal_filter),
- E_SUCCESS);
- AllocTmpBuffer(context, mkl_tmp_filter_buf_tensor, mkl_lt_internal_filter,
- &mkl_buf_convert_filter);
- CHECK_EQ(dnnConversionExecute_F32(mkl_prim_convert_filter, mkl_buf_filter,
- mkl_buf_convert_filter),
+ // Compare incoming tensor layouts with MKL preferred layouts and convert
+ // data to the preferred layout if necessary
+ void MklPrepareConvolutionInputs(OpKernelContext* context,
+ Tensor* mkl_tmp_input_buf_tensor,
+ Tensor* mkl_tmp_filter_buf_tensor,
+ Tensor* mkl_tmp_bias_buf_tensor) {
+ bool mkl_convert_input, mkl_convert_filter, mkl_convert_bias;
+ dnnPrimitive_t mkl_prim_convert_filter, mkl_prim_convert_bias,
+ mkl_prim_convert_input;
+ dnnLayout_t mkl_lt_internal_filter, mkl_lt_internal_bias,
+ mkl_lt_internal_input;
+ void *mkl_buf_convert_input, *mkl_buf_convert_filter,
+ *mkl_buf_convert_bias;
+ mkl_prim_convert_filter = nullptr;
+ mkl_prim_convert_bias = nullptr;
+ mkl_prim_convert_input = nullptr;
+ mkl_lt_internal_filter = nullptr;
+ mkl_lt_internal_bias = nullptr;
+ mkl_lt_internal_input = nullptr;
+ mkl_buf_convert_input = nullptr;
+ mkl_buf_convert_filter = nullptr;
+ mkl_buf_convert_bias = nullptr;
+
+ // Compare with internal layouts and convert if needed
+ const Tensor& input = MklGetInput(context, 0);
+ void* mkl_buf_input =
+ const_cast<void*>(static_cast<const void*>(input.flat<T>().data()));
+ CHECK_EQ(dnnLayoutCreateFromPrimitive_F32(&mkl_lt_internal_input,
+ prim_fwd, dnnResourceSrc),
E_SUCCESS);
- dnnDelete_F32(mkl_prim_convert_filter);
- }
- dnnLayoutDelete_F32(mkl_lt_internal_filter);
+ mkl_convert_input =
+ !dnnLayoutCompare_F32(mkl_lt_internal_input, lt_input);
+ if (mkl_convert_input) {
+ CHECK_EQ(dnnConversionCreate_F32(&mkl_prim_convert_input, lt_input,
+ mkl_lt_internal_input),
+ E_SUCCESS);
+ AllocTmpBuffer(context, mkl_tmp_input_buf_tensor, mkl_lt_internal_input,
+ &mkl_buf_convert_input);
+ CHECK_EQ(dnnConversionExecute_F32(mkl_prim_convert_input, mkl_buf_input,
+ mkl_buf_convert_input),
+ E_SUCCESS);
+ dnnDelete_F32(mkl_prim_convert_input);
+ }
+ dnnLayoutDelete_F32(mkl_lt_internal_input);
- mkl_conv_res_[dnnResourceFilter] =
- (mkl_convert_filter) ? mkl_buf_convert_filter : mkl_buf_filter;
+ conv_res[dnnResourceSrc] =
+ (mkl_convert_input) ? mkl_buf_convert_input : mkl_buf_input;
- if (biasEnabled) {
- const Tensor& bias = MklGetInput(context, 2);
- void* mkl_buf_bias =
- const_cast<void*>(static_cast<const void*>(bias.flat<T>().data()));
- CHECK_EQ(dnnLayoutCreateFromPrimitive_F32(&mkl_lt_internal_bias,
- mkl_prim_convolution_fwd_,
- dnnResourceBias),
+ const Tensor& filter = MklGetInput(context, 1);
+ void* mkl_buf_filter =
+ const_cast<void*>(static_cast<const void*>(filter.flat<T>().data()));
+ CHECK_EQ(dnnLayoutCreateFromPrimitive_F32(&mkl_lt_internal_filter,
+ prim_fwd, dnnResourceFilter),
E_SUCCESS);
- mkl_convert_bias =
- !dnnLayoutCompare_F32(mkl_lt_internal_bias, mkl_lt_bias_);
- if (mkl_convert_bias) {
- CHECK_EQ(dnnConversionCreate_F32(&mkl_prim_convert_bias, mkl_lt_bias_,
- mkl_lt_internal_bias),
+ mkl_convert_filter =
+ !dnnLayoutCompare_F32(mkl_lt_internal_filter, lt_filter);
+ if (mkl_convert_filter) {
+ CHECK_EQ(dnnConversionCreate_F32(&mkl_prim_convert_filter, lt_filter,
+ mkl_lt_internal_filter),
E_SUCCESS);
- AllocTmpBuffer(context, mkl_tmp_bias_buf_tensor, mkl_lt_internal_bias,
- &mkl_buf_convert_bias);
- CHECK_EQ(dnnConversionExecute_F32(mkl_prim_convert_bias, mkl_buf_bias,
- mkl_buf_convert_bias),
- E_SUCCESS);
- dnnDelete_F32(mkl_prim_convert_bias);
+ AllocTmpBuffer(context, mkl_tmp_filter_buf_tensor,
+ mkl_lt_internal_filter, &mkl_buf_convert_filter);
+ CHECK_EQ(
+ dnnConversionExecute_F32(mkl_prim_convert_filter, mkl_buf_filter,
+ mkl_buf_convert_filter),
+ E_SUCCESS);
+ dnnDelete_F32(mkl_prim_convert_filter);
}
- dnnLayoutDelete_F32(mkl_lt_internal_bias);
+ dnnLayoutDelete_F32(mkl_lt_internal_filter);
+
+ conv_res[dnnResourceFilter] =
+ (mkl_convert_filter) ? mkl_buf_convert_filter : mkl_buf_filter;
- mkl_conv_res_[dnnResourceBias] =
- (mkl_convert_bias) ? mkl_buf_convert_bias : mkl_buf_bias;
+ if (biasEnabled) {
+ const Tensor& bias = MklGetInput(context, 2);
+ void* mkl_buf_bias =
+ const_cast<void*>(static_cast<const void*>(bias.flat<T>().data()));
+ CHECK_EQ(dnnLayoutCreateFromPrimitive_F32(&mkl_lt_internal_bias,
+ prim_fwd, dnnResourceBias),
+ E_SUCCESS);
+ mkl_convert_bias = !dnnLayoutCompare_F32(mkl_lt_internal_bias, lt_bias);
+ if (mkl_convert_bias) {
+ CHECK_EQ(dnnConversionCreate_F32(&mkl_prim_convert_bias, lt_bias,
+ mkl_lt_internal_bias),
+ E_SUCCESS);
+ AllocTmpBuffer(context, mkl_tmp_bias_buf_tensor, mkl_lt_internal_bias,
+ &mkl_buf_convert_bias);
+ CHECK_EQ(dnnConversionExecute_F32(mkl_prim_convert_bias, mkl_buf_bias,
+ mkl_buf_convert_bias),
+ E_SUCCESS);
+ dnnDelete_F32(mkl_prim_convert_bias);
+ }
+ dnnLayoutDelete_F32(mkl_lt_internal_bias);
+
+ conv_res[dnnResourceBias] =
+ (mkl_convert_bias) ? mkl_buf_convert_bias : mkl_buf_bias;
+ }
}
- }
- void MklCleanup() {
- bool input_in_mkl_format = mkl_params_.input_shape.IsMklTensor();
- dnnDelete_F32(mkl_prim_convolution_fwd_);
- if (!input_in_mkl_format) dnnLayoutDelete_F32(mkl_lt_input_);
- dnnLayoutDelete_F32(mkl_lt_filter_);
- if (biasEnabled) dnnLayoutDelete_F32(mkl_lt_bias_);
- }
+ void MklCleanup() {
+ bool input_in_mkl_format = input_shape.IsMklTensor();
+ dnnDelete_F32(prim_fwd);
+ if (!input_in_mkl_format) dnnLayoutDelete_F32(lt_input);
+ dnnLayoutDelete_F32(lt_filter);
+ if (biasEnabled) dnnLayoutDelete_F32(lt_bias);
+ }
+ } MklConv2DOpContext;
std::vector<int32> strides_;
Padding padding_;
TensorFormat data_format_;
-
- MklConv2DOpParams mkl_params_;
- dnnPrimitive_t mkl_prim_convolution_fwd_ = nullptr;
- void* mkl_conv_res_[dnnResourceNumber];
- dnnLayout_t mkl_lt_filter_ = nullptr, mkl_lt_bias_ = nullptr,
- mkl_lt_input_ = nullptr;
};
#define REGISTER_MKL_CPU(T) \
diff --git a/tensorflow/core/kernels/mkl_maxpooling_op.cc b/tensorflow/core/kernels/mkl_maxpooling_op.cc
new file mode 100644
index 0000000000..9d6cfb0c97
--- /dev/null
+++ b/tensorflow/core/kernels/mkl_maxpooling_op.cc
@@ -0,0 +1,506 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+// See docs in ../ops/nn_ops.cc.
+#ifdef INTEL_MKL
+#define EIGEN_USE_THREADS
+
+#include "tensorflow/core/framework/op_kernel.h"
+#include "tensorflow/core/kernels/mkl_pooling_ops_common.h"
+#include "tensorflow/core/lib/core/errors.h"
+#include "tensorflow/core/util/mkl_util.h"
+#include "tensorflow/core/util/padding.h"
+
+namespace tensorflow {
+
+typedef Eigen::ThreadPoolDevice CPUDevice;
+
+// An implementation of MaxPooling (forward).
+template <typename Device, typename T>
+class MklMaxPoolingOp : public OpKernel {
+ public:
+ explicit MklMaxPoolingOp(OpKernelConstruction* context) : OpKernel(context) {
+ string data_format;
+
+ OP_REQUIRES_OK(context, context->GetAttr("data_format", &data_format));
+ OP_REQUIRES(context, FormatFromString(data_format, &data_format_),
+ errors::InvalidArgument("Invalid data format"));
+ OP_REQUIRES_OK(context, context->GetAttr("ksize", &ksize_));
+ OP_REQUIRES(context, ksize_.size() == 4,
+ errors::InvalidArgument("Sliding window ksize field must "
+ "specify 4 dimensions"));
+ OP_REQUIRES_OK(context, context->GetAttr("strides", &stride_));
+ OP_REQUIRES(context, stride_.size() == 4,
+ errors::InvalidArgument("Sliding window stride field must "
+ "specify 4 dimensions"));
+ OP_REQUIRES_OK(context, context->GetAttr("padding", &padding_));
+ OP_REQUIRES(context, ksize_[0] == 1 && stride_[0] == 1,
+ errors::Unimplemented("Pooling is not yet supported on the "
+ "batch dimension."));
+
+ workspace_enabled_ = false;
+    // We may not get this attribute if the node did not go through the graph
+    // rewrite pass, so we do not check for errors when retrieving its value.
+ context->GetAttr("workspace_enabled", &workspace_enabled_);
+ }
+
+ void Compute(OpKernelContext* context) override {
+ MklMaxPoolingOpContext mkl_context;
+ // Get the input tensor
+ const Tensor& tensor_in = MklGetInput(context, 0);
+ GetMklShape(context, 0, &mkl_context.input_shape);
+ bool input_in_mkl_format = mkl_context.input_shape.IsMklTensor();
+
+ mkl_context.params.in_dim = 4;
+ MklPoolParameters pool_params;
+ if (input_in_mkl_format == false) {
+ pool_params.Init(context, ksize_, stride_, padding_, data_format_,
+ tensor_in.shape());
+ OP_REQUIRES(
+ context, (pool_params.depth_window == 1),
+ errors::Unimplemented("Depthwise max pooling not supported by MKL"));
+
+ } else {
+ pool_params.Init(context, ksize_, stride_, padding_, data_format_,
+ &mkl_context.input_shape);
+ }
+
+    // Extract the parameters for the op from the pooling specs
+    ExtractMklOpParams(context, data_format_, pool_params, &mkl_context.params);
+
+ mkl_context.MklCreateLayoutsAndPrimitives(context);
+
+ // Declare output tensor
+ TensorShape tensor_out_shape;
+ MklShape mkl_out_shape;
+ mkl_out_shape.SetMklTensor(true);
+ mkl_out_shape.SetMklLayout(mkl_context.prim_pooling_fwd, dnnResourceDst);
+ mkl_out_shape.SetTfLayout(mkl_context.params.in_dim,
+ mkl_context.params.out_sizes,
+ mkl_context.params.out_strides);
+ mkl_out_shape.SetTfDimOrder(mkl_context.params.in_dim, data_format_);
+
+ Tensor* output_tensor = nullptr;
+ tensor_out_shape.AddDim(dnnLayoutGetMemorySize_F32(static_cast<dnnLayout_t>(
+ mkl_out_shape.GetMklLayout())) /
+ sizeof(T));
+ AllocateOutputSetMklshape(context, 0, &output_tensor, tensor_out_shape,
+ mkl_out_shape);
+
+ if (!workspace_enabled_) {
+ mkl_out_shape.SetMklTensor(false);
+ }
+
+    Tensor* workspace_tensor = nullptr;
+    Tensor workspace_buf_tensor;
+    void* workspace_buf = nullptr;
+    if (workspace_enabled_) {
+      TensorShape workspace_shape;
+      workspace_shape.AddDim(
+          dnnLayoutGetMemorySize_F32(
+              static_cast<dnnLayout_t>(mkl_context.lt_workspace)) /
+          sizeof(T));
+      AllocateOutputSetMklshape(context, 1, &workspace_tensor, workspace_shape,
+                                mkl_out_shape);
+      mkl_context.pooling_res[dnnResourceWorkspace] = const_cast<void*>(
+          static_cast<const void*>(workspace_tensor->flat<T>().data()));
+    } else {
+      // Allocate the temporary buffer via a Tensor that outlives the
+      // primitive execution, rather than an uninitialized Tensor pointer.
+      AllocTmpBuffer(context, &workspace_buf_tensor, mkl_context.lt_workspace,
+                     &workspace_buf);
+      mkl_context.pooling_res[dnnResourceWorkspace] = workspace_buf;
+    }
+
+ mkl_context.pooling_res[dnnResourceSrc] =
+ const_cast<void*>(static_cast<const void*>(tensor_in.flat<T>().data()));
+ mkl_context.pooling_res[dnnResourceDst] = const_cast<void*>(
+ static_cast<const void*>(output_tensor->flat<T>().data()));
+
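+    // All resources (src, dst, workspace) are now wired up; run the forward
+    // pooling primitive.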
+ CHECK_EQ(
+ dnnExecute_F32(mkl_context.prim_pooling_fwd, mkl_context.pooling_res),
+ E_SUCCESS);
+
+ mkl_context.MklCleanup();
+ }
+
+ private:
+ typedef struct {
+ MklPoolingOpParams params;
+ MklShape input_shape;
+ void* pooling_res[dnnResourceNumber];
+ dnnPrimitive_t prim_pooling_fwd;
+ dnnLayout_t lt_user_input, lt_workspace;
+
+ void MklCreateLayoutsAndPrimitives(OpKernelContext* context) {
+ bool input_in_mkl_format = input_shape.IsMklTensor();
+ // Create or use existing DNN user layout
+ if (input_in_mkl_format == false) {
+ CHECK_EQ(dnnLayoutCreate_F32(&lt_user_input, params.in_dim,
+ params.in_sizes, params.in_strides),
+ E_SUCCESS);
+ } else {
+ lt_user_input = (dnnLayout_t)input_shape.GetCurLayout();
+ }
+
+ dnnAlgorithm_t algorithm = dnnAlgorithmPoolingMax;
+ dnnPrimitiveAttributes_t primAttr = nullptr;
+
+ // Create DNN primitives
+ CHECK_EQ(dnnPoolingCreateForward_F32(
+ &prim_pooling_fwd, primAttr, algorithm, lt_user_input,
+ params.kernel_size, params.kernel_stride, params.in_offset,
+ dnnBorderZerosAsymm),
+ E_SUCCESS);
+
+ // Creates layout for the workspace
+ CHECK_EQ(dnnLayoutCreateFromPrimitive_F32(&lt_workspace, prim_pooling_fwd,
+ dnnResourceWorkspace),
+ E_SUCCESS);
+ }
+
+ void MklCleanup() {
+ bool input_in_mkl_format = input_shape.IsMklTensor();
+ CHECK_EQ(dnnDelete_F32(prim_pooling_fwd), E_SUCCESS);
+ if (!input_in_mkl_format) {
+ CHECK_EQ(dnnLayoutDelete_F32(lt_user_input), E_SUCCESS);
+ }
+ CHECK_EQ(dnnLayoutDelete_F32(lt_workspace), E_SUCCESS);
+ }
+ } MklMaxPoolingOpContext;
+
+ std::vector<int32> ksize_;
+ std::vector<int32> stride_;
+ Padding padding_;
+ TensorFormat data_format_;
+ bool workspace_enabled_;
+};
+
+// The operation to compute MaxPool gradients.
+// It takes three inputs:
+// - The original input tensor
+// - The original output tensor
+// - Backprop tensor for output
+// It produces one output: backprop tensor for input.
+template <class Device, class T>
+class MklMaxPoolingGradOp : public OpKernel {
+ public:
+ explicit MklMaxPoolingGradOp(OpKernelConstruction* context)
+ : OpKernel(context) {
+ string data_format;
+
+ OP_REQUIRES_OK(context, context->GetAttr("data_format", &data_format));
+ OP_REQUIRES(context, FormatFromString(data_format, &data_format_),
+ errors::InvalidArgument("Invalid data format"));
+ OP_REQUIRES_OK(context, context->GetAttr("ksize", &ksize_));
+ OP_REQUIRES(context, ksize_.size() == 4,
+ errors::InvalidArgument("Sliding window ksize field must "
+ "specify 4 dimensions"));
+ OP_REQUIRES_OK(context, context->GetAttr("strides", &stride_));
+ OP_REQUIRES(context, stride_.size() == 4,
+ errors::InvalidArgument("Sliding window strides field must "
+ "specify 4 dimensions"));
+ OP_REQUIRES_OK(context, context->GetAttr("padding", &padding_));
+ OP_REQUIRES(context, ksize_[0] == 1 && stride_[0] == 1,
+ errors::Unimplemented(
+ "Pooling is not yet supported on the batch dimension."));
+ workspace_enabled_ = false;
+    // We may not get this attribute if the node did not go through the graph
+    // rewrite pass, so we do not check for errors when retrieving its value.
+ context->GetAttr("workspace_enabled", &workspace_enabled_);
+ }
+
+ void Compute(OpKernelContext* context) override {
+ MklMaxPoolingGradOpContext mkl_context;
+ // Input - The original input tensor
+ const Tensor& tensor_in = MklGetInput(context, 0);
+
+ // Output - Backprop tensor for input.
+ Tensor* output_tensor = nullptr;
+
+ GetMklShape(context, 0, &mkl_context.input_shape);
+ GetMklShape(context, 2, &mkl_context.output_backprop_shape);
+ bool input_in_mkl_format = mkl_context.input_shape.IsMklTensor();
+
+ if (input_in_mkl_format == false)
+ mkl_context.params.in_dim = tensor_in.dims();
+ else
+ mkl_context.params.in_dim = mkl_context.input_shape.GetDimension();
+
+ MklPoolParameters pool_params;
+ if (input_in_mkl_format == false) {
+ pool_params.Init(context, ksize_, stride_, padding_, data_format_,
+ tensor_in.shape());
+ OP_REQUIRES(
+ context, (pool_params.depth_window == 1),
+ errors::Unimplemented("Depthwise max pooling not supported by MKL"));
+
+ } else {
+ pool_params.Init(context, ksize_, stride_, padding_, data_format_,
+ &mkl_context.input_shape);
+ }
+
+ // Extract the parameters for the op from the pooling specs
+ ExtractMklOpParams(context, data_format_, pool_params, &mkl_context.params);
+
+ mkl_context.MklCreateLayouts(context);
+ mkl_context.MklCreatePrimitives(context, workspace_enabled_);
+ mkl_context.MklPrepareInputs(context, workspace_enabled_);
+
+ // Create shape for the input back prop output
+ TensorShape mkl_input_backprop;
+ MklShape mkl_output_shape;
+ mkl_output_shape.SetMklTensor(true);
+ mkl_output_shape.SetMklLayout(mkl_context.prim_pooling_bwd,
+ dnnResourceDiffSrc);
+ mkl_output_shape.SetTfLayout(mkl_context.params.in_dim,
+ mkl_context.params.in_sizes,
+ mkl_context.params.in_strides);
+ mkl_output_shape.SetTfDimOrder(mkl_context.params.in_dim, data_format_);
+
+ mkl_input_backprop.AddDim(
+ dnnLayoutGetMemorySize_F32(
+ static_cast<dnnLayout_t>(mkl_output_shape.GetMklLayout())) /
+ sizeof(T));
+ AllocateOutputSetMklshape(context, 0, &output_tensor, mkl_input_backprop,
+ mkl_output_shape);
+ mkl_context.pooling_res[dnnResourceDiffSrc] = const_cast<void*>(
+ static_cast<const void*>(output_tensor->flat<T>().data()));
+
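+    // Zero-initialize the input gradient: the backward primitive writes only
+    // the positions selected by max pooling (per the workspace), leaving all
+    // other elements untouched.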
+ int64 output_size = output_tensor->NumElements();
+ for (int64 i = 0; i < output_size; ++i) {
+ (static_cast<float*>(mkl_context.pooling_res[dnnResourceDiffSrc]))[i] = 0;
+ }
+
+ CHECK_EQ(
+ dnnExecute_F32(mkl_context.prim_pooling_bwd, mkl_context.pooling_res),
+ E_SUCCESS);
+
+ mkl_context.MklCleanup(workspace_enabled_);
+ }
+
+ private:
+ typedef struct {
+ MklPoolingOpParams params;
+ MklShape input_shape, output_backprop_shape;
+ void* pooling_resfwd[dnnResourceNumber];
+ void* pooling_res[dnnResourceNumber];
+    dnnPrimitive_t prim_pooling_fwd = nullptr, prim_pooling_bwd = nullptr,
+        convert_input = nullptr, convert_outbackprop = nullptr;
+ dnnLayout_t lt_outbackprop_user, lt_outbackprop_prim, lt_input_user,
+ lt_input_prim;
+ void* input_buf;
+ void* outbackprop_buf;
+
+ void MklCreateLayouts(OpKernelContext* context) {
+ bool input_in_mkl_format = input_shape.IsMklTensor();
+ bool outbackprop_in_mkl_format = output_backprop_shape.IsMklTensor();
+ // Create DNN user layout for input and outbackprop or get existing layout
+ if (input_in_mkl_format == false) {
+ CHECK_EQ(dnnLayoutCreate_F32(&lt_input_user, params.in_dim,
+ params.in_sizes, params.in_strides),
+ E_SUCCESS);
+ } else {
+ lt_input_user = (dnnLayout_t)input_shape.GetCurLayout();
+ }
+
+      // We don't need a user layout for the output here, since it can be
+      // created from the max-pooling forward-prop primitive.
+ if (outbackprop_in_mkl_format == false) {
+ CHECK_EQ(dnnLayoutCreate_F32(&lt_outbackprop_user, params.in_dim,
+ params.out_sizes, params.out_strides),
+ E_SUCCESS);
+ } else {
+ lt_outbackprop_user = (dnnLayout_t)output_backprop_shape.GetCurLayout();
+ }
+ }
+
+ // Create DNN primitives
+ void MklCreatePrimitives(OpKernelContext* context, bool workspace_enabled) {
+ dnnAlgorithm_t algorithm = dnnAlgorithmPoolingMax;
+ dnnPrimitiveAttributes_t primAttr = nullptr;
+
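+      // When no workspace input is provided, also create the forward
+      // primitive: it is re-executed in MklPrepareInputs to regenerate the
+      // workspace of max positions that the backward pass consumes.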
+ if (workspace_enabled == false) {
+ CHECK_EQ(dnnPoolingCreateForward_F32(
+ &prim_pooling_fwd, primAttr, algorithm, lt_input_user,
+ params.kernel_size, params.kernel_stride, params.in_offset,
+ dnnBorderZerosAsymm),
+ E_SUCCESS);
+ }
+
+ CHECK_EQ(dnnPoolingCreateBackward_F32(
+ &prim_pooling_bwd, primAttr, algorithm, lt_input_user,
+ params.kernel_size, params.kernel_stride, params.in_offset,
+ dnnBorderZerosAsymm),
+ E_SUCCESS);
+
+ // Creates conversions
+ CHECK_EQ(dnnLayoutCreateFromPrimitive_F32(
+ &lt_outbackprop_prim, prim_pooling_bwd, dnnResourceDiffDst),
+ E_SUCCESS);
+
+ // Tensors needed to create temporary buffers
+ Tensor input_buf_tensor, outbackprop_buf_tensor;
+
+ if (workspace_enabled == false) {
+ CHECK_EQ(dnnLayoutCreateFromPrimitive_F32(
+ &lt_input_prim, prim_pooling_fwd, dnnResourceSrc),
+ E_SUCCESS);
+ if (!dnnLayoutCompare_F32(lt_input_user, lt_input_prim)) {
+ CHECK_EQ(dnnConversionCreate_F32(&convert_input, lt_input_user,
+ lt_input_prim),
+ E_SUCCESS);
+ AllocTmpBuffer(context, &input_buf_tensor, lt_input_prim, &input_buf);
+ }
+ }
+
+ if (!dnnLayoutCompare_F32(lt_outbackprop_user, lt_outbackprop_prim)) {
+ CHECK_EQ(
+ dnnConversionCreate_F32(&convert_outbackprop, lt_outbackprop_user,
+ lt_outbackprop_prim),
+ E_SUCCESS);
+ AllocTmpBuffer(context, &outbackprop_buf_tensor, lt_outbackprop_prim,
+ &outbackprop_buf);
+ }
+ }
+
+ // Compare incoming tensor layouts with MKL preferred layouts and convert
+ // data to the preferred layout if necessary
+ void MklPrepareInputs(OpKernelContext* context, bool workspace_enabled) {
+ const Tensor& tensor_in = MklGetInput(context, 0);
+ const Tensor& out_backprop = MklGetInput(context, 2);
+ bool input_in_mkl_format = input_shape.IsMklTensor();
+ bool outbackprop_in_mkl_format = output_backprop_shape.IsMklTensor();
+
+ void* tmp_output_buf;
+ Tensor tmp_output_buf_tensor;
+
+ void* workspace_buf;
+ Tensor workspace_buf_tensor;
+
+ if (workspace_enabled == false) {
+ if (convert_input != nullptr) {
+ if (input_in_mkl_format == false) {
+ CHECK_EQ(dnnConversionExecute_F32(
+ convert_input,
+ const_cast<void*>(static_cast<const void*>(
+ tensor_in.flat<T>().data())),
+ input_buf),
+ E_SUCCESS);
+ CHECK_EQ(dnnDelete_F32(convert_input), E_SUCCESS);
+ convert_input = nullptr;
+ } else {
+ input_shape.GetConvertedFlatData(
+ lt_input_prim,
+ const_cast<void*>(
+ static_cast<const void*>(tensor_in.flat<T>().data())),
+ input_buf);
+ }
+ pooling_resfwd[dnnResourceSrc] = input_buf;
+ } else {
+ pooling_resfwd[dnnResourceSrc] = const_cast<void*>(
+ static_cast<const void*>(tensor_in.flat<T>().data()));
+ }
+
+ dnnLayout_t lt_workspace;
+ CHECK_EQ(dnnLayoutCreateFromPrimitive_F32(
+ &lt_workspace, prim_pooling_fwd, dnnResourceWorkspace),
+ E_SUCCESS);
+ AllocTmpBuffer(context, &workspace_buf_tensor, lt_workspace,
+ &workspace_buf);
+ pooling_resfwd[dnnResourceWorkspace] = workspace_buf;
+
+ dnnLayoutDelete_F32(lt_workspace);
+
+        // Allocate a temporary buffer for the forward-prop output using the
+        // backprop-dst layout, which has the same shape.
+ AllocTmpBuffer(context, &tmp_output_buf_tensor, lt_outbackprop_prim,
+ &tmp_output_buf);
+ pooling_resfwd[dnnResourceDst] = tmp_output_buf;
+
+ CHECK_EQ(dnnExecute_F32(prim_pooling_fwd, pooling_resfwd), E_SUCCESS);
+ pooling_res[dnnResourceWorkspace] =
+ pooling_resfwd[dnnResourceWorkspace];
+ } else {
+ const Tensor& workspace = MklGetInput(context, 3);
+ pooling_res[dnnResourceWorkspace] = const_cast<void*>(
+ static_cast<const void*>(workspace.flat<T>().data()));
+ }
+
+ // Out backprop conversions if needed
+ if (convert_outbackprop != nullptr) {
+ if (outbackprop_in_mkl_format == false) {
+ CHECK_EQ(dnnConversionExecute_F32(
+ convert_outbackprop,
+ const_cast<void*>(static_cast<const void*>(
+ out_backprop.flat<T>().data())),
+ outbackprop_buf),
+ E_SUCCESS);
+ CHECK_EQ(dnnDelete_F32(convert_outbackprop), E_SUCCESS);
+ } else {
+ output_backprop_shape.GetConvertedFlatData(
+ lt_outbackprop_prim,
+ const_cast<void*>(
+ static_cast<const void*>(out_backprop.flat<T>().data())),
+ outbackprop_buf);
+ }
+ pooling_res[dnnResourceDiffDst] = outbackprop_buf;
+ } else {
+ pooling_res[dnnResourceDiffDst] = const_cast<void*>(
+ static_cast<const void*>(out_backprop.flat<T>().data()));
+ }
+ }
+
+ void MklCleanup(bool workspace_enabled) {
+ bool input_in_mkl_format = input_shape.IsMklTensor();
+ bool outbackprop_in_mkl_format = output_backprop_shape.IsMklTensor();
+ if (workspace_enabled == false) {
+ CHECK_EQ(dnnDelete_F32(prim_pooling_fwd), E_SUCCESS);
+ }
+ CHECK_EQ(dnnDelete_F32(prim_pooling_bwd), E_SUCCESS);
+ if (outbackprop_in_mkl_format == false) {
+ CHECK_EQ(dnnLayoutDelete_F32(lt_outbackprop_user), E_SUCCESS);
+ }
+ CHECK_EQ(dnnLayoutDelete_F32(lt_outbackprop_prim), E_SUCCESS);
+ if (input_in_mkl_format == false) {
+ CHECK_EQ(dnnLayoutDelete_F32(lt_input_user), E_SUCCESS);
+ }
+ if (workspace_enabled == false) {
+ CHECK_EQ(dnnLayoutDelete_F32(lt_input_prim), E_SUCCESS);
+ }
+ }
+ } MklMaxPoolingGradOpContext;
+
+ std::vector<int32> ksize_;
+ std::vector<int32> stride_;
+ Padding padding_;
+ TensorFormat data_format_;
+
+ bool workspace_enabled_;
+};
+
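+// Register the MKL max-pooling kernels for float only; the MKL layer label
+// limits them to nodes tagged by the MKL graph-rewrite pass.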
+REGISTER_KERNEL_BUILDER(Name("MklMaxPool")
+ .Device(DEVICE_CPU)
+ .TypeConstraint<float>("T")
+ .Label(mkl_layer_registry::kMklLayerLabel),
+ MklMaxPoolingOp<CPUDevice, float>);
+
+REGISTER_KERNEL_BUILDER(Name("MklMaxPoolGrad")
+ .Device(DEVICE_CPU)
+ .TypeConstraint<float>("T")
+ .Label(mkl_layer_registry::kMklLayerLabel),
+ MklMaxPoolingGradOp<CPUDevice, float>);
+
+} // namespace tensorflow
+#endif // INTEL_MKL
diff --git a/tensorflow/core/kernels/mkl_pooling_ops_common.cc b/tensorflow/core/kernels/mkl_pooling_ops_common.cc
new file mode 100644
index 0000000000..d88bd4c640
--- /dev/null
+++ b/tensorflow/core/kernels/mkl_pooling_ops_common.cc
@@ -0,0 +1,150 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifdef INTEL_MKL
+#include "tensorflow/core/kernels/mkl_pooling_ops_common.h"
+#include <vector>
+#include "tensorflow/core/common_runtime/device.h"
+#include "tensorflow/core/framework/common_shape_fns.h"
+
+namespace tensorflow {
+
+// Initialization for TensorFlow format
+void MklPoolParameters::Init(OpKernelContext* context,
+ const std::vector<int32>& ksize,
+ const std::vector<int32>& stride, Padding padding,
+ TensorFormat data_format,
+ const TensorShape& tensor_in_shape) {
+ // For maxpooling, tensor_in should have 4 dimensions.
+ OP_REQUIRES(context, tensor_in_shape.dims() == 4,
+ errors::InvalidArgument("tensor_in must be 4-dimensional"));
+
+ depth = GetTensorDim(tensor_in_shape, data_format, 'C');
+ tensor_in_cols = GetTensorDim(tensor_in_shape, data_format, 'W');
+ tensor_in_rows = GetTensorDim(tensor_in_shape, data_format, 'H');
+ tensor_in_batch = GetTensorDim(tensor_in_shape, data_format, 'N');
+
+ Init(context, ksize, stride, padding, data_format);
+}
+
+// Initialization for MKL format
+void MklPoolParameters::Init(OpKernelContext* context,
+ const std::vector<int32>& ksize,
+ const std::vector<int32>& stride, Padding padding,
+ TensorFormat data_format,
+ const MklShape* mklInputShape) {
+ // Get the input sizes
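+ // MklShape stores sizes innermost-first: [0]=W, [1]=H, [2]=C, [3]=N.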
+ depth = mklInputShape->GetSizes()[2];
+ tensor_in_cols = mklInputShape->GetSizes()[0];
+ tensor_in_rows = mklInputShape->GetSizes()[1];
+ tensor_in_batch = mklInputShape->GetSizes()[3];
+
+ Init(context, ksize, stride, padding, data_format);
+}
+
+// Common Initialization for TensorFlow and MKL formats
+void MklPoolParameters::Init(OpKernelContext* context,
+ const std::vector<int32>& ksize,
+ const std::vector<int32>& stride, Padding padding,
+ TensorFormat data_format) {
+ // Get the data format
+ this->data_format = data_format;
+
+ // Get the output sizes
+ window_rows = GetTensorDim(ksize, data_format, 'H');
+ window_cols = GetTensorDim(ksize, data_format, 'W');
+ depth_window = GetTensorDim(ksize, data_format, 'C');
+
+ // Get the strides
+ row_stride = GetTensorDim(stride, data_format, 'H');
+ col_stride = GetTensorDim(stride, data_format, 'W');
+ depth_stride = GetTensorDim(stride, data_format, 'C');
+
+ // We only support 2D pooling across width/height and depthwise
+ // pooling, not a combination.
+ OP_REQUIRES(context,
+ (depth_window == 1 || (window_rows == 1 && window_cols == 1)),
+ errors::Unimplemented(
+ "MaxPooling supports exactly one of pooling across depth "
+ "or pooling across width/height."));
+
+ if (depth_window == 1) {
+ OP_REQUIRES_OK(context, GetWindowedOutputSizeVerbose(
+ tensor_in_rows, window_rows, row_stride,
+ padding, &out_height, &pad_top, &pad_bottom));
+
+ OP_REQUIRES_OK(context, GetWindowedOutputSizeVerbose(
+ tensor_in_cols, window_cols, col_stride,
+ padding, &out_width, &pad_left, &pad_right));
+ } else {
+ // Our current version of depthwise max pooling does not support
+ // any padding, and expects the depth_window to equal the depth
+ // stride (no overlapping).
+ OP_REQUIRES(context, depth % depth_window == 0,
+ errors::Unimplemented("Depthwise max pooling requires the"
+ " depth window to evenly divide the"
+ " input depth"));
+ OP_REQUIRES(context, depth_stride == depth_window,
+ errors::Unimplemented("Depthwise max pooling requires the"
+ " depth window to equal the depth"
+ " stride"));
+
+ // The current version of depthwise max is only implemented on CPU.
+ OP_REQUIRES(context,
+ (DeviceType(static_cast<Device*>(context->device())
+ ->attributes()
+ .device_type()) == DeviceType(DEVICE_CPU)),
+ errors::Unimplemented("Depthwise max pooling is currently "
+ "only implemented for CPU devices."));
+
+ pad_depth = 0;
+ out_depth = depth / depth_window;
+ }
+}
+
+// Transfers the right parameters for pooling to the op parameters
+// Updates context->status if there is an invalid input.
+void ExtractMklOpParams(OpKernelContext* context, TensorFormat data_format,
+ const MklPoolParameters& params,
+ MklPoolingOpParams* mkl_params) {
+ mkl_params->in_sizes[0] = params.tensor_in_cols;
+ mkl_params->in_sizes[1] = params.tensor_in_rows;
+ mkl_params->in_sizes[2] = params.depth;
+ mkl_params->in_sizes[3] = params.tensor_in_batch;
+
+ GetStridesFromSizes(data_format, mkl_params->in_strides,
+ mkl_params->in_sizes);
+
+ mkl_params->out_sizes[0] = params.out_width;
+ mkl_params->out_sizes[1] = params.out_height;
+ mkl_params->out_sizes[2] = params.depth;
+ mkl_params->out_sizes[3] = params.tensor_in_batch;
+
+ GetStridesFromSizes(data_format, mkl_params->out_strides,
+ mkl_params->out_sizes);
+
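+ // MKL takes input offsets expressed as negative padding:
+ // left, top, right, bottom.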
+ mkl_params->in_offset[0] = -params.pad_left;
+ mkl_params->in_offset[1] = -params.pad_top;
+ mkl_params->in_offset[2] = -params.pad_right;
+ mkl_params->in_offset[3] = -params.pad_bottom;
+
+ mkl_params->kernel_stride[0] = params.col_stride;
+ mkl_params->kernel_stride[1] = params.row_stride;
+
+ mkl_params->kernel_size[0] = params.window_cols;
+ mkl_params->kernel_size[1] = params.window_rows;
+}
+} // namespace tensorflow
+#endif // INTEL_MKL
diff --git a/tensorflow/core/kernels/mkl_pooling_ops_common.h b/tensorflow/core/kernels/mkl_pooling_ops_common.h
new file mode 100644
index 0000000000..92ea2beb25
--- /dev/null
+++ b/tensorflow/core/kernels/mkl_pooling_ops_common.h
@@ -0,0 +1,92 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifndef TENSORFLOW_CORE_KERNELS_MKL_POOLING_OPS_COMMON_H_
+#define TENSORFLOW_CORE_KERNELS_MKL_POOLING_OPS_COMMON_H_
+
+#ifdef INTEL_MKL
+#include <vector>
+#include "tensorflow/core/util/mkl_util.h"
+#include "tensorflow/core/util/padding.h"
+
+namespace tensorflow {
+
+typedef Eigen::ThreadPoolDevice CPUDevice;
+
+struct MklPoolParameters {
+ int depth;
+
+ int tensor_in_cols;
+ int tensor_in_rows;
+ int tensor_in_batch;
+
+ int window_rows;
+ int window_cols;
+ int depth_window;
+
+ int row_stride;
+ int col_stride;
+ int depth_stride;
+
+ int64 out_height;
+ int64 out_width;
+ int out_depth;
+
+ int64 pad_left;
+ int64 pad_right;
+ int64 pad_top;
+ int64 pad_bottom;
+ int pad_depth;
+
+ TensorFormat data_format;
+
+ // Updates context->status if there is an invalid input.
+ void Init(OpKernelContext* context, const std::vector<int32>& ksize,
+ const std::vector<int32>& stride, Padding padding,
+ TensorFormat data_format, const TensorShape& tensor_in_shape);
+ void Init(OpKernelContext* context, const std::vector<int32>& ksize,
+ const std::vector<int32>& stride, Padding padding,
+ TensorFormat data_format, const MklShape* mkl_in_shape);
+
+ private:
+ // Common initialization for TensorFlow and MKL formats
+ void Init(OpKernelContext* context, const std::vector<int32>& ksize,
+ const std::vector<int32>& stride, Padding padding,
+ TensorFormat data_format);
+};
+
+//-------------------------------------------------------------------
+// Utility functions
+
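+// Layout parameters handed to the MKL DNN pooling primitives. Sizes and
+// strides follow MKL's innermost-first ordering (see ExtractMklOpParams).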
+typedef struct {
+ size_t in_dim;
+ size_t in_sizes[4];
+ size_t in_strides[4];
+ size_t out_sizes[4];
+ size_t out_strides[4];
+ int in_offset[4];
+ size_t kernel_stride[2];
+ size_t kernel_size[2];
+} MklPoolingOpParams;
+
+// Transfers the right parameters for pooling to the op parameters
+// Updates context->status if there is an invalid input.
+void ExtractMklOpParams(OpKernelContext* context, TensorFormat data_format,
+ const MklPoolParameters& params,
+ MklPoolingOpParams* mkl_params);
+} // namespace tensorflow
+
+#endif // INTEL_MKL
+#endif // TENSORFLOW_CORE_KERNELS_MKL_POOLING_OPS_COMMON_H_
diff --git a/tensorflow/core/kernels/mkl_relu_op.cc b/tensorflow/core/kernels/mkl_relu_op.cc
new file mode 100644
index 0000000000..7809711524
--- /dev/null
+++ b/tensorflow/core/kernels/mkl_relu_op.cc
@@ -0,0 +1,397 @@
+/* Copyright 2015 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+// See docs in ../ops/nn_ops.cc.
+#ifdef INTEL_MKL
+
+#include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
+#include "tensorflow/core/framework/numeric_op.h"
+#include "tensorflow/core/framework/op_kernel.h"
+#include "tensorflow/core/framework/register_types.h"
+#include "tensorflow/core/framework/tensor.h"
+#include "tensorflow/core/lib/core/errors.h"
+
+#include "third_party/mkl/include/mkl_dnn.h"
+#include "third_party/mkl/include/mkl_dnn_types.h"
+#include "tensorflow/core/platform/default/logging.h"
+#include "tensorflow/core/util/mkl_util.h"
+
+namespace tensorflow {
+
+typedef Eigen::ThreadPoolDevice CPUDevice;
+
+struct MklReluHelpers {
+ static void ValidateSameSizeHelper(OpKernelContext* context, const Tensor& g,
+ const Tensor& a) {
+ OP_REQUIRES(context, a.IsSameSize(g),
+ errors::InvalidArgument("g and a must be the same size"));
+ }
+ static bool ValidateSameSize(OpKernelContext* context, const Tensor& g,
+ const Tensor& a) {
+ ValidateSameSizeHelper(context, g, a);
+ return context->status().ok();
+ }
+};
+
+template <typename Device, typename T>
+class MklReluOp : public OpKernel {
+ public:
+ ~MklReluOp() {}
+
+ explicit MklReluOp(OpKernelConstruction* context) : OpKernel(context) {}
+
+ void Compute(OpKernelContext* context) override {
+ MklReluOpContext mkl_context;
+
+ const Tensor& input = MklGetInput(context, 0);
+ GetMklShape(context, 0, &mkl_context.input_shape);
+ void* user_i = static_cast<void*>(const_cast<T*>(input.flat<T>().data()));
+ bool input_in_mkl_format = mkl_context.input_shape.IsMklTensor();
+ if (!input_in_mkl_format && !input.dims()) { // handle the case of a scalar
+ const TensorShape& o_shape = input.shape();
+ Tensor* out_tensor = nullptr;
+ mkl_context.output_shape.SetMklTensor(false);
+ AllocateOutputSetMklshape(context, 0, &out_tensor, o_shape,
+ mkl_context.output_shape);
+ void* out_o = static_cast<void*>(out_tensor->flat<T>().data());
+ (static_cast<T*>(out_o))[0] =
+ std::max((static_cast<T*>(user_i))[0], static_cast<T>(0));
+ return;
+ }
+
+ // Generate size, stride for input if input is in MKL format.
+ if (input_in_mkl_format) {
+ mkl_context.in_dims = mkl_context.input_shape.GetDimension();
+ mkl_context.in_sizes = new size_t[mkl_context.in_dims];
+ mkl_context.in_strides = new size_t[mkl_context.in_dims];
+ for (int i = 0; i < mkl_context.in_dims; i++) {
+ mkl_context.in_sizes[i] = mkl_context.input_shape.GetSizes()[i];
+ mkl_context.in_strides[i] = mkl_context.input_shape.GetStrides()[i];
+ }
+ } else {
+ mkl_context.in_dims = input.dims();
+ mkl_context.in_sizes = new size_t[mkl_context.in_dims];
+ mkl_context.in_strides = new size_t[mkl_context.in_dims];
+ for (int i = 0; i < mkl_context.in_dims; i++) {
+ mkl_context.in_sizes[i] = input.dim_size((mkl_context.in_dims - 1) - i);
+ }
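+ // Compute contiguous strides: the innermost stride is 1 and each
+ // subsequent stride is the running product of the preceding sizes.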
+ mkl_context.in_strides[0] = 1;
+ for (int i = 1; i < mkl_context.in_dims; i++) {
+ mkl_context.in_strides[i] =
+ mkl_context.in_strides[i - 1] * mkl_context.in_sizes[i - 1];
+ }
+ }
+
+ float negative_slope = 0.0;
+ mkl_context.MklCreateInputLayouts(context);
+ CHECK_EQ(dnnReLUCreateForward_F32(&mkl_context.prim_relu_fwd, NULL,
+ mkl_context.lt_input, negative_slope),
+ E_SUCCESS);
+
+ Tensor* output = nullptr;
+
+ if (input_in_mkl_format) {
+ TensorShape tf_shape;
+ mkl_context.output_shape.SetMklTensor(true);
+ mkl_context.output_shape.SetMklLayout(mkl_context.prim_relu_fwd,
+ dnnResourceDst);
+ mkl_context.output_shape.SetTfLayout(
+ mkl_context.in_dims, mkl_context.in_sizes, mkl_context.in_strides);
+ mkl_context.output_shape.SetTfDimOrder(
+ mkl_context.in_dims, mkl_context.input_shape.GetTfToMklDimMap());
+ tf_shape.AddDim(dnnLayoutGetMemorySize_F32(static_cast<dnnLayout_t>(
+ mkl_context.output_shape.GetMklLayout())) /
+ sizeof(T));
+ AllocateOutputSetMklshape(context, 0, &output, tf_shape,
+ mkl_context.output_shape);
+ } else {
+ const TensorShape& o_shape = input.shape();
+ mkl_context.output_shape.SetMklTensor(false);
+ AllocateOutputSetMklshape(context, 0, &output, o_shape,
+ mkl_context.output_shape);
+ }
+
+ void* user_o = static_cast<void*>(const_cast<T*>(output->flat<T>().data()));
+
+ mkl_context.relu_res[dnnResourceDst] = user_o;
+ mkl_context.relu_res[dnnResourceSrc] = user_i;
+ CHECK_EQ(dnnExecute_F32(mkl_context.prim_relu_fwd, mkl_context.relu_res),
+ E_SUCCESS);
+ mkl_context.MklCleanup();
+ }
+
+ private:
+ typedef struct {
+ int in_dims;
+ size_t* in_sizes;
+ size_t* in_strides;
+ MklShape input_shape, output_shape;
+ dnnPrimitive_t prim_relu_fwd = nullptr;
+ void* relu_res[dnnResourceNumber];
+ dnnLayout_t lt_input = nullptr;
+
+ void MklCleanup() {
+ bool input_in_mkl_format = input_shape.IsMklTensor();
+ if (!input_in_mkl_format) {
+ dnnLayoutDelete_F32(lt_input);
+ // in_sizes/in_strides were allocated with new[], so use delete[].
+ delete[] in_sizes;
+ delete[] in_strides;
+ }
+ dnnDelete_F32(prim_relu_fwd);
+ }
+
+ void MklCreateInputLayouts(OpKernelContext* context) {
+ bool input_in_mkl_format = input_shape.IsMklTensor();
+ if (!input_in_mkl_format) {
+ CHECK_EQ(dnnLayoutCreate_F32(&lt_input, in_dims, in_sizes, in_strides),
+ E_SUCCESS);
+ } else {
+ lt_input = static_cast<dnnLayout_t>(input_shape.GetCurLayout());
+ }
+ }
+ } MklReluOpContext;
+};
+
+template <typename Device, typename T>
+class MklReluGradOp : public OpKernel {
+ public:
+ ~MklReluGradOp() {}
+
+ explicit MklReluGradOp(OpKernelConstruction* context) : OpKernel(context) {}
+
+ void Compute(OpKernelContext* context) override;
+
+ private:
+ typedef struct {
+ int in_dims;
+ size_t* in_sizes;
+ size_t* in_strides;
+ MklShape input_shape, grad_shape, output_shape;
+ void* relu_res[dnnResourceNumber];
+ dnnPrimitive_t prim_relu_bwd;
+ dnnLayout_t lt_input, lt_grad;
+
+ void MklPrepareReluGradInputs(OpKernelContext* context,
+ Tensor* mkl_tmp_grad_buf_tensor,
+ Tensor* mkl_tmp_input_buf_tensor) {
+ dnnPrimitive_t cv_user_to_reluB_input, cv_user_to_reluB_grad;
+ dnnLayout_t mkl_lt_internal_input, mkl_lt_internal_grad;
+
+ const Tensor& g = MklGetInput(context, 0);
+ const Tensor& a = MklGetInput(context, 1);
+
+ void* user_i = static_cast<void*>(const_cast<T*>(a.flat<T>().data()));
+ void* user_g = static_cast<void*>(const_cast<T*>(g.flat<T>().data()));
+
+ CHECK_EQ(dnnLayoutCreateFromPrimitive_F32(
+ &mkl_lt_internal_grad, prim_relu_bwd, dnnResourceDiffDst),
+ E_SUCCESS);
+
+ CHECK_EQ(dnnLayoutCreateFromPrimitive_F32(&mkl_lt_internal_input,
+ prim_relu_bwd, dnnResourceSrc),
+ E_SUCCESS);
+
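+ // Convert the gradient and input into the primitive's internal
+ // layouts only when they differ from the user layouts.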
+ if (!dnnLayoutCompare_F32(mkl_lt_internal_grad, lt_grad)) {
+ AllocTmpBuffer(context, mkl_tmp_grad_buf_tensor, mkl_lt_internal_grad,
+ &relu_res[dnnResourceDiffDst]);
+ CHECK_EQ(dnnConversionCreate_F32(&cv_user_to_reluB_grad, lt_grad,
+ mkl_lt_internal_grad),
+ E_SUCCESS);
+ CHECK_EQ(dnnConversionExecute_F32(cv_user_to_reluB_grad, user_g,
+ relu_res[dnnResourceDiffDst]),
+ E_SUCCESS);
+ dnnDelete_F32(cv_user_to_reluB_grad);
+ } else {
+ relu_res[dnnResourceDiffDst] = user_g;
+ }
+
+ if (!dnnLayoutCompare_F32(mkl_lt_internal_input, lt_input)) {
+ AllocTmpBuffer(context, mkl_tmp_input_buf_tensor, mkl_lt_internal_input,
+ &relu_res[dnnResourceSrc]);
+ CHECK_EQ(dnnConversionCreate_F32(&cv_user_to_reluB_input, lt_input,
+ mkl_lt_internal_input),
+ E_SUCCESS);
+ CHECK_EQ(dnnConversionExecute_F32(cv_user_to_reluB_input, user_i,
+ relu_res[dnnResourceSrc]),
+ E_SUCCESS);
+ dnnDelete_F32(cv_user_to_reluB_input);
+ } else {
+ relu_res[dnnResourceSrc] = user_i;
+ }
+
+ dnnLayoutDelete_F32(mkl_lt_internal_input);
+ dnnLayoutDelete_F32(mkl_lt_internal_grad);
+ }
+
+ void MklCreateInputLayouts(OpKernelContext* context) {
+ bool grad_is_mkl = grad_shape.IsMklTensor();
+ bool input_is_mkl = input_shape.IsMklTensor();
+ if (!input_is_mkl) {
+ CHECK_EQ(dnnLayoutCreate_F32(&lt_input, in_dims, in_sizes, in_strides),
+ E_SUCCESS);
+ } else {
+ lt_input = static_cast<dnnLayout_t>(input_shape.GetCurLayout());
+ }
+
+ if (!grad_is_mkl) {
+ CHECK_EQ(dnnLayoutCreate_F32(&lt_grad, in_dims, in_sizes, in_strides),
+ E_SUCCESS);
+ } else {
+ lt_grad = static_cast<dnnLayout_t>(grad_shape.GetCurLayout());
+ }
+ }
+
+ void MklCleanup() {
+ bool grad_is_mkl = grad_shape.IsMklTensor();
+ bool input_is_mkl = input_shape.IsMklTensor();
+ dnnDelete_F32(prim_relu_bwd);
+ if (!input_is_mkl) {
+ dnnLayoutDelete_F32(lt_input);
+ // in_sizes/in_strides were allocated with new[], so use delete[].
+ delete[] in_sizes;
+ delete[] in_strides;
+ }
+ if (!grad_is_mkl) {
+ dnnLayoutDelete_F32(lt_grad);
+ }
+ }
+ } MklReluGradOpContext;
+};
+
+template <typename Device, typename T>
+void MklReluGradOp<Device, T>::Compute(OpKernelContext* context) {
+ MklReluGradOpContext mkl_context;
+ const Tensor& g = MklGetInput(context, 0);
+ const Tensor& a = MklGetInput(context, 1);
+
+ void* user_i = static_cast<void*>(const_cast<T*>(a.flat<T>().data()));
+ void* user_g = static_cast<void*>(const_cast<T*>(g.flat<T>().data()));
+
+ GetMklShape(context, 0, &mkl_context.grad_shape);
+ GetMklShape(context, 1, &mkl_context.input_shape);
+
+ bool grad_is_mkl = mkl_context.grad_shape.IsMklTensor();
+ bool input_is_mkl = mkl_context.input_shape.IsMklTensor();
+ if (!input_is_mkl && !grad_is_mkl &&
+ !MklReluHelpers::ValidateSameSize(context, g, a))
+ return;
+ Tensor* output = nullptr;
+ if (!input_is_mkl && !grad_is_mkl &&
+ !a.dims()) { // handle the case of a scalar
+ // Allocate an output of the same shape as g and compute the scalar gradient.
+ const TensorShape& g_shape = g.shape();
+ mkl_context.output_shape.SetMklTensor(false);
+ AllocateOutputSetMklshape(context, 0, &output, g_shape,
+ mkl_context.output_shape);
+ void* out_o = static_cast<void*>(output->flat<T>().data());
+ (static_cast<T*>(out_o))[0] =
+ (static_cast<T*>(user_g))[0] * ((static_cast<T*>(user_i))[0] > 0);
+ return;
+ }
+
+ // Generate size, stride for input if input/grad is in MKL format.
+ if (grad_is_mkl || input_is_mkl) {
+ const MklShape* tmp_mkl_shape =
+ (grad_is_mkl) ? &mkl_context.grad_shape : &mkl_context.input_shape;
+
+ mkl_context.in_dims = tmp_mkl_shape->GetDimension();
+ mkl_context.in_strides = new size_t[mkl_context.in_dims];
+ mkl_context.in_sizes = new size_t[mkl_context.in_dims];
+ for (int i = 0; i < mkl_context.in_dims; i++) {
+ mkl_context.in_sizes[i] = tmp_mkl_shape->GetSizes()[i];
+ mkl_context.in_strides[i] = tmp_mkl_shape->GetStrides()[i];
+ }
+ } else {
+ mkl_context.in_dims = g.dims();
+ mkl_context.in_strides = new size_t[mkl_context.in_dims];
+ mkl_context.in_sizes = new size_t[mkl_context.in_dims];
+
+ for (int i = 0; i < mkl_context.in_dims; i++) {
+ mkl_context.in_sizes[i] = g.dim_size((mkl_context.in_dims - 1) - i);
+ }
+ mkl_context.in_strides[0] = 1;
+ for (int i = 1; i < mkl_context.in_dims; i++) {
+ mkl_context.in_strides[i] =
+ mkl_context.in_strides[i - 1] * mkl_context.in_sizes[i - 1];
+ }
+ }
+
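+ // Build user layouts for the input and gradient, then create the
+ // backward ReLU primitive from them.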
+ mkl_context.MklCreateInputLayouts(context);
+ float negative_slope = 0.0;
+ CHECK_EQ(dnnReLUCreateBackward_F32(&mkl_context.prim_relu_bwd, NULL,
+ mkl_context.lt_grad, mkl_context.lt_input,
+ negative_slope),
+ E_SUCCESS);
+ Tensor mkl_tmp_grad_buf_tensor, mkl_tmp_input_buf_tensor;
+ mkl_context.MklPrepareReluGradInputs(context, &mkl_tmp_grad_buf_tensor,
+ &mkl_tmp_input_buf_tensor);
+
+ if (input_is_mkl ||
+ grad_is_mkl) { /*if grad or input are MKL leave it in MKL*/
+ TensorShape tf_shape;
+ mkl_context.output_shape.SetMklTensor(true);
+ mkl_context.output_shape.SetMklLayout(mkl_context.prim_relu_bwd,
+ dnnResourceDiffSrc);
+ mkl_context.output_shape.SetTfLayout(
+ mkl_context.in_dims, mkl_context.in_sizes, mkl_context.in_strides);
+ // If input_is_mkl or grad_is_mkl, then we copy strides and sizes from Mkl
+ // shape of one that is in MKL layout.
+ if (grad_is_mkl == true) {
+ mkl_context.output_shape.SetTfDimOrder(
+ mkl_context.in_dims, mkl_context.grad_shape.GetTfToMklDimMap());
+ } else {
+ mkl_context.output_shape.SetTfDimOrder(
+ mkl_context.in_dims, mkl_context.input_shape.GetTfToMklDimMap());
+ }
+
+ tf_shape.AddDim(dnnLayoutGetMemorySize_F32(static_cast<dnnLayout_t>(
+ mkl_context.output_shape.GetMklLayout())) /
+ sizeof(T));
+ AllocateOutputSetMklshape(context, 0, &output, tf_shape,
+ mkl_context.output_shape);
+
+ } else {
+ const TensorShape& o_shape = g.shape();
+ mkl_context.output_shape.SetMklTensor(false);
+ AllocateOutputSetMklshape(context, 0, &output, o_shape,
+ mkl_context.output_shape);
+ }
+
+ mkl_context.relu_res[dnnResourceDiffSrc] =
+ static_cast<void*>(output->flat<T>().data());
+
+ CHECK_EQ(dnnExecute_F32(mkl_context.prim_relu_bwd, mkl_context.relu_res),
+ E_SUCCESS);
+ mkl_context.MklCleanup();
+}
+
+/* Register DNN kernels for supported operations and supported types - right
+ * now it is only Relu and f32. */
+#define REGISTER_RELU_MKL_SUPPORTED_KERNELS_TYPES(type) \
+ REGISTER_KERNEL_BUILDER(Name("MklRelu") \
+ .Device(DEVICE_CPU) \
+ .TypeConstraint<type>("T") \
+ .Label(mkl_layer_registry::kMklLayerLabel), \
+ MklReluOp<CPUDevice, type>); \
+ REGISTER_KERNEL_BUILDER(Name("MklReluGrad") \
+ .Device(DEVICE_CPU) \
+ .TypeConstraint<type>("T") \
+ .Label(mkl_layer_registry::kMklLayerLabel), \
+ MklReluGradOp<CPUDevice, type>);
+TF_CALL_float(REGISTER_RELU_MKL_SUPPORTED_KERNELS_TYPES);
+
+} // namespace tensorflow
+
+#endif // INTEL_MKL
diff --git a/tensorflow/core/kernels/mkl_tfconv_op.cc b/tensorflow/core/kernels/mkl_tfconv_op.cc
index 35029152ec..51f90b3f90 100644
--- a/tensorflow/core/kernels/mkl_tfconv_op.cc
+++ b/tensorflow/core/kernels/mkl_tfconv_op.cc
@@ -67,28 +67,10 @@ class MklToTfOp : public OpKernel {
CHECK_EQ(op_data_type, input_data_type);
CHECK_EQ(op_data_type, output_data_type);
- // We need to recreate Tf tensor shape based on sizes and strides.
- // Ideally, we should know what the data_format is, but that attribute
- // to this op is not reliable. So below, we rely of sorting logic where
- // we sort strides first and then sizes.
TensorShape output_shape;
- std::vector<std::pair<int, int>> shape_size;
for (size_t i = 0; i < input_shape.GetDimension(); i++) {
- VLOG(1) << "Size: " << input_shape.GetSizes()[i]
- << ", Strides: " << input_shape.GetStrides()[i];
- shape_size.push_back(std::make_pair(input_shape.GetSizes()[i],
- input_shape.GetStrides()[i]));
- }
-
- std::sort(shape_size.begin(), shape_size.end(),
- [](std::pair<int, int> a, std::pair<int, int> b) {
- return (a.second > b.second) ||
- (a.second == b.second && a.first > b.first);
- });
-
- for (std::pair<int, int> s_s : shape_size) {
- VLOG(1) << "Added dimension: " << s_s.first;
- output_shape.AddDim(s_s.first);
+ // Outermost to innermost dimension
+ output_shape.AddDim(input_shape.GetSizes()[input_shape.tf_dim_idx(i)]);
}
// Allocate output tensor.
diff --git a/tensorflow/core/kernels/pooling_ops_3d.cc b/tensorflow/core/kernels/pooling_ops_3d.cc
index f12c18eaa8..538dca24ae 100644
--- a/tensorflow/core/kernels/pooling_ops_3d.cc
+++ b/tensorflow/core/kernels/pooling_ops_3d.cc
@@ -14,12 +14,15 @@ limitations under the License.
==============================================================================*/
#define EIGEN_USE_THREADS
+#include "tensorflow/core/kernels/pooling_ops_3d.h"
+
#include <array>
#include "third_party/eigen3/Eigen/Core"
#include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
#include "tensorflow/core/framework/numeric_op.h"
#include "tensorflow/core/framework/op_kernel.h"
+#include "tensorflow/core/framework/register_types.h"
#include "tensorflow/core/framework/tensor.h"
#include "tensorflow/core/framework/tensor_shape.h"
#include "tensorflow/core/framework/tensor_slice.h"
@@ -28,15 +31,64 @@ limitations under the License.
#include "tensorflow/core/lib/core/errors.h"
#include "tensorflow/core/util/padding.h"
#include "tensorflow/core/util/tensor_format.h"
+#include "tensorflow/core/util/work_sharder.h"
#if GOOGLE_CUDA
#include "tensorflow/core/kernels/cudnn_pooling_gpu.h"
+#include "tensorflow/core/kernels/pooling_ops_3d_gpu.h"
#endif
namespace tensorflow {
typedef Eigen::ThreadPoolDevice CPUDevice;
typedef Eigen::GpuDevice GPUDevice;
+Pool3dParameters::Pool3dParameters(OpKernelContext* context,
+ const std::vector<int32>& ksize,
+ const std::vector<int32>& stride,
+ Padding padding, TensorFormat data_format,
+ const TensorShape& tensor_in_shape) {
+ // For maxpooling3d, tensor_in should have 5 dimensions.
+ OP_REQUIRES(context, tensor_in_shape.dims() == 5,
+ errors::InvalidArgument("tensor_in must be 5-dimensional"));
+
+ this->data_format = data_format;
+ depth = GetTensorDim(tensor_in_shape, data_format, 'C');
+ tensor_in_planes = GetTensorDim(tensor_in_shape, data_format, '0');
+ tensor_in_rows = GetTensorDim(tensor_in_shape, data_format, '1');
+ tensor_in_cols = GetTensorDim(tensor_in_shape, data_format, '2');
+ tensor_in_batch = GetTensorDim(tensor_in_shape, data_format, 'N');
+ window_planes = GetTensorDim(ksize, data_format, '0');
+ window_rows = GetTensorDim(ksize, data_format, '1');
+ window_cols = GetTensorDim(ksize, data_format, '2');
+ depth_window = GetTensorDim(ksize, data_format, 'C');
+ plane_stride = GetTensorDim(stride, data_format, '0');
+ row_stride = GetTensorDim(stride, data_format, '1');
+ col_stride = GetTensorDim(stride, data_format, '2');
+ depth_stride = GetTensorDim(stride, data_format, 'C');
+
+ // We only support 3D pooling across plane/width/height. Depthwise
+ // pooling is not supported.
+ OP_REQUIRES(
+ context, depth_window == 1 && depth_stride == 1,
+ errors::Unimplemented(
+ "Pooling3d only supports pooling across plane/width/height."));
+
+ OP_REQUIRES_OK(context, GetWindowedOutputSize(tensor_in_planes, window_planes,
+ plane_stride, padding,
+ &out_plane, &pad_planes));
+ OP_REQUIRES_OK(context,
+ GetWindowedOutputSize(tensor_in_rows, window_rows, row_stride,
+ padding, &out_height, &pad_rows));
+ OP_REQUIRES_OK(context,
+ GetWindowedOutputSize(tensor_in_cols, window_cols, col_stride,
+ padding, &out_width, &pad_cols));
+}
+
+TensorShape Pool3dParameters::forward_output_shape() {
+ return ShapeFromFormat(data_format, tensor_in_batch,
+ {{out_plane, out_height, out_width}}, depth);
+}
+
enum PoolingType { MAX, AVG };
template <typename Device, typename T, PoolingType Type>
@@ -147,12 +199,6 @@ class Pooling3DOp : public UnaryOp<T> {
Padding padding_;
TensorFormat data_format_;
};
-REGISTER_KERNEL_BUILDER(
- Name("AvgPool3D").Device(DEVICE_CPU).TypeConstraint<float>("T"),
- Pooling3DOp<CPUDevice, float, AVG>);
-REGISTER_KERNEL_BUILDER(
- Name("MaxPool3D").Device(DEVICE_CPU).TypeConstraint<float>("T"),
- Pooling3DOp<CPUDevice, float, MAX>);
template <typename Device, typename T>
struct LaunchMaxPooling3dGradOp;
@@ -331,10 +377,6 @@ class MaxPooling3dGradOp : public OpKernel {
TensorFormat data_format_;
};
-REGISTER_KERNEL_BUILDER(
- Name("MaxPool3DGrad").Device(DEVICE_CPU).TypeConstraint<float>("T"),
- MaxPooling3dGradOp<CPUDevice, float>);
-
template <typename Device, typename T>
struct LaunchAvgPooling3dGradOp;
@@ -499,11 +541,208 @@ class AvgPooling3dGradOp : public OpKernel {
TensorFormat data_format_;
};
-REGISTER_KERNEL_BUILDER(Name("AvgPool3DGrad")
- .Device(DEVICE_CPU)
- .TypeConstraint<float>("T")
- .HostMemory("orig_input_shape"),
- AvgPooling3dGradOp<CPUDevice, float>);
+template <typename Device, typename T>
+struct LaunchMaxPooling3dGradGradOp;
+
+template <typename T>
+struct LaunchMaxPooling3dGradGradOp<CPUDevice, T> {
+ static void launch(OpKernelContext* context, const Pool3dParameters& params,
+ const Tensor& tensor_in, const Tensor& tensor_out,
+ const Tensor& tensor_top_diff,
+ Tensor* tensor_bottom_diff) {
+ OP_REQUIRES(
+ context, params.data_format == FORMAT_NHWC,
+ errors::InvalidArgument("Default MaxPooling3dGradGradOp only supports",
+ "NDHWC on CPU device type"));
+
+ typedef Eigen::Map<const Eigen::Matrix<T, Eigen::Dynamic, Eigen::Dynamic>>
+ ConstEigenMatrixMap;
+ typedef Eigen::Map<Eigen::Matrix<T, Eigen::Dynamic, Eigen::Dynamic>>
+ EigenMatrixMap;
+
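+ // View each tensor as a (depth x spatial*batch) matrix, so that one
+ // column holds all channels of a single spatial location.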
+ ConstEigenMatrixMap in_mat(tensor_in.flat<T>().data(), params.depth,
+ params.tensor_in_planes * params.tensor_in_cols *
+ params.tensor_in_rows *
+ params.tensor_in_batch);
+ ConstEigenMatrixMap out_mat(tensor_out.flat<T>().data(), params.depth,
+ params.out_plane * params.out_width *
+ params.out_height * params.tensor_in_batch);
+ ConstEigenMatrixMap top_diff_mat(
+ tensor_top_diff.flat<T>().data(), params.depth,
+ params.tensor_in_planes * params.tensor_in_cols *
+ params.tensor_in_rows * params.tensor_in_batch);
+ EigenMatrixMap bottom_diff_mat(
+ tensor_bottom_diff->flat<T>().data(), params.depth,
+ params.out_plane * params.out_width * params.out_height *
+ params.tensor_in_batch);
+
+ const DeviceBase::CpuWorkerThreads& worker_threads =
+ *(context->device()->tensorflow_cpu_worker_threads());
+
+ auto shard = [&params, &in_mat, &out_mat, &top_diff_mat, &bottom_diff_mat](
+ int64 start, int64 limit) {
+ const int32 depth = params.depth;
+ const int32 in_planes = params.tensor_in_planes;
+ const int32 in_rows = params.tensor_in_rows;
+ const int32 in_cols = params.tensor_in_cols;
+ const int32 pad_planes = params.pad_planes;
+ const int32 pad_rows = params.pad_rows;
+ const int32 pad_cols = params.pad_cols;
+ const int32 window_planes = params.window_planes;
+ const int32 window_rows = params.window_rows;
+ const int32 window_cols = params.window_cols;
+ const int32 plane_stride = params.plane_stride;
+ const int32 row_stride = params.row_stride;
+ const int32 col_stride = params.col_stride;
+ const int32 out_plane = params.out_plane;
+ const int32 out_height = params.out_height;
+ const int32 out_width = params.out_width;
+
+ {
+ // Initializes the output grad backprop tensor with 0.
+ const int32 output_image_size =
+ out_plane * out_height * out_width * params.depth;
+ EigenMatrixMap bottom_diff_shard(
+ bottom_diff_mat.data() + start * output_image_size, 1,
+ (limit - start) * output_image_size);
+ bottom_diff_shard.setZero();
+ }
+
+ for (int b = start; b < limit; ++b) {
+ for (int pp = 0; pp < out_plane; ++pp) {
+ for (int ph = 0; ph < out_height; ++ph) {
+ for (int pw = 0; pw < out_width; ++pw) {
+ // (p_start, p_end) * (h_start, h_end) * (w_start, w_end) is the
+ // range that the input vector projects to.
+ int p_start = pp * plane_stride - pad_planes;
+ const int p_end = std::min(p_start + window_planes, in_planes);
+ int h_start = ph * row_stride - pad_rows;
+ const int h_end = std::min(h_start + window_rows, in_rows);
+ int w_start = pw * col_stride - pad_cols;
+ const int w_end = std::min(w_start + window_cols, in_cols);
+ p_start = std::max(p_start, 0);
+ h_start = std::max(h_start, 0);
+ w_start = std::max(w_start, 0);
+ const int out_index =
+ ((b * out_plane + pp) * out_height + ph) * out_width + pw;
+ // Find value corresponding to the input maximum in top_diff.
+ for (int d = 0; d < depth; ++d) {
+ const T& output_ref = out_mat.coeffRef(d, out_index);
+ bool should_stop = false;
+ for (int p = p_start; p < p_end && !should_stop; ++p) {
+ for (int h = h_start; h < h_end && !should_stop; ++h) {
+ for (int w = w_start; w < w_end && !should_stop; ++w) {
+ const int in_index =
+ ((b * in_planes + p) * in_rows + h) * in_cols + w;
+ const T& input_ref = in_mat.coeffRef(d, in_index);
+ if (output_ref == input_ref) {
+ T& bottom_diff_ref =
+ bottom_diff_mat.coeffRef(d, out_index);
+ bottom_diff_ref = top_diff_mat.coeffRef(d, in_index);
+ should_stop = true;
+ }
+ }
+ }
+ }
+ }
+ }
+ }
+ }
+ }
+ };
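+ // Cost estimate per batch element: each output voxel scans its
+ // pooling window once.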
+ const int64 shard_cost =
+ params.out_plane * params.out_height * params.out_width * params.depth *
+ params.window_planes * params.window_rows * params.window_cols;
+ Shard(worker_threads.num_threads, worker_threads.workers,
+ params.tensor_in_batch, shard_cost, shard);
+ }
+};
+
+template <class Device, class T>
+class MaxPooling3dGradGradOp : public OpKernel {
+ public:
+ explicit MaxPooling3dGradGradOp(OpKernelConstruction* context)
+ : OpKernel(context) {
+ string data_format;
+ OP_REQUIRES_OK(context, context->GetAttr("data_format", &data_format));
+ OP_REQUIRES(context, FormatFromString(data_format, &data_format_),
+ errors::InvalidArgument("Invalid data format"));
+ OP_REQUIRES_OK(context, context->GetAttr("ksize", &ksize_));
+ OP_REQUIRES(context, ksize_.size() == 5,
+ errors::InvalidArgument("Sliding window ksize field must "
+ "specify 5 dimensions"));
+ OP_REQUIRES_OK(context, context->GetAttr("strides", &stride_));
+ OP_REQUIRES(context, stride_.size() == 5,
+ errors::InvalidArgument("Sliding window strides field must "
+ "specify 5 dimensions"));
+ OP_REQUIRES_OK(context, context->GetAttr("padding", &padding_));
+ OP_REQUIRES(context, ksize_[0] == 1 && stride_[0] == 1,
+ errors::Unimplemented(
+ "Pooling is not yet supported on the batch dimension."));
+ const int32 ksize_c = GetTensorDim(ksize_, data_format_, 'C');
+ const int32 stride_c = GetTensorDim(stride_, data_format_, 'C');
+ OP_REQUIRES(context, ksize_c == 1 && stride_c == 1,
+ errors::Unimplemented("MaxPooling3dGradGrad is not yet "
+ "supported on the depth dimension."));
+ }
+
+ void Compute(OpKernelContext* context) override {
+ const Tensor& tensor_in = context->input(0);
+ const Tensor& tensor_out = context->input(1);
+ const Tensor& out_grad_backprop = context->input(2);
+
+ // For maxpooling3d, tensor_in should have 5 dimensions.
+ OP_REQUIRES(context, tensor_in.dims() == 5,
+ errors::InvalidArgument("tensor_in must be 5-dimensional"));
+ OP_REQUIRES(context, tensor_out.dims() == 5,
+ errors::InvalidArgument("tensor_out must be 5-dimensional"));
+ // For maxpooling3d, out_grad_backprop should have 5 dimensions.
+ OP_REQUIRES(
+ context, out_grad_backprop.dims() == 5,
+ errors::InvalidArgument("out_grad_backprop must be 5-dimensional"));
+
+ Pool3dParameters params{context, ksize_, stride_,
+ padding_, data_format_, tensor_in.shape()};
+
+ Tensor* output = nullptr;
+ OP_REQUIRES_OK(context, context->forward_input_or_allocate_output(
+ {2}, 0, tensor_out.shape(), &output));
+
+ LaunchMaxPooling3dGradGradOp<Device, T>::launch(
+ context, params, tensor_in, tensor_out, out_grad_backprop, output);
+ }
+
+ private:
+ std::vector<int32> ksize_;
+ std::vector<int32> stride_;
+ Padding padding_;
+ TensorFormat data_format_;
+};
+
+#define REGISTER_KERNELS(D, T) \
+ REGISTER_KERNEL_BUILDER( \
+ Name("MaxPool3D").Device(DEVICE_##D).TypeConstraint<T>("T"), \
+ Pooling3DOp<D##Device, T, MAX>); \
+ REGISTER_KERNEL_BUILDER(Name("MaxPool3DGrad") \
+ .Device(DEVICE_##D) \
+ .TypeConstraint<T>("T") \
+ .TypeConstraint<T>("TInput"), \
+ MaxPooling3dGradOp<D##Device, T>); \
+ REGISTER_KERNEL_BUILDER( \
+ Name("MaxPool3DGradGrad").Device(DEVICE_##D).TypeConstraint<T>("T"), \
+ MaxPooling3dGradGradOp<D##Device, T>); \
+ REGISTER_KERNEL_BUILDER( \
+ Name("AvgPool3D").Device(DEVICE_##D).TypeConstraint<T>("T"), \
+ Pooling3DOp<D##Device, T, AVG>); \
+ REGISTER_KERNEL_BUILDER(Name("AvgPool3DGrad") \
+ .Device(DEVICE_##D) \
+ .TypeConstraint<T>("T") \
+ .HostMemory("orig_input_shape"), \
+ AvgPooling3dGradOp<D##Device, T>);
+
+#define REGISTER_CPU_KERNELS(T) REGISTER_KERNELS(CPU, T)
+TF_CALL_float(REGISTER_CPU_KERNELS);
+#undef REGISTER_CPU_KERNELS
#if GOOGLE_CUDA
@@ -535,13 +774,6 @@ struct LaunchPoolingOp<GPUDevice, T, MAX> {
}
};
-REGISTER_KERNEL_BUILDER(
- Name("AvgPool3D").Device(DEVICE_GPU).TypeConstraint<float>("T"),
- Pooling3DOp<GPUDevice, float, AVG>);
-REGISTER_KERNEL_BUILDER(
- Name("MaxPool3D").Device(DEVICE_GPU).TypeConstraint<float>("T"),
- Pooling3DOp<GPUDevice, float, MAX>);
-
template <typename T>
struct LaunchMaxPooling3dGradOp<GPUDevice, T> {
static void launch(OpKernelContext* context, const Tensor& tensor_in,
@@ -559,10 +791,6 @@ struct LaunchMaxPooling3dGradOp<GPUDevice, T> {
}
};
-REGISTER_KERNEL_BUILDER(
- Name("MaxPool3DGrad").Device(DEVICE_GPU).TypeConstraint<float>("T"),
- MaxPooling3dGradOp<GPUDevice, float>);
-
template <typename T>
struct LaunchAvgPooling3dGradOp<GPUDevice, T> {
static void launch(OpKernelContext* context,
@@ -579,12 +807,36 @@ struct LaunchAvgPooling3dGradOp<GPUDevice, T> {
nullptr, nullptr, output);
}
};
-REGISTER_KERNEL_BUILDER(Name("AvgPool3DGrad")
- .Device(DEVICE_GPU)
- .TypeConstraint<float>("T")
- .HostMemory("orig_input_shape"),
- AvgPooling3dGradOp<GPUDevice, float>);
+
+template <typename T>
+struct LaunchMaxPooling3dGradGradOp<GPUDevice, T> {
+ static void launch(OpKernelContext* context, const Pool3dParameters& params,
+ const Tensor& tensor_in, const Tensor& tensor_out,
+ const Tensor& tensor_top_diff,
+ Tensor* tensor_bottom_diff) {
+ bool status = functor::MaxPool3dGradBackward<T>()(
+ params.data_format, tensor_in.flat<T>().data(),
+ tensor_out.flat<T>().data(), params.tensor_in_batch, params.out_plane,
+ params.out_height, params.out_width, params.depth,
+ params.tensor_in_planes, params.tensor_in_rows, params.tensor_in_cols,
+ params.window_planes, params.window_rows, params.window_cols,
+ params.plane_stride, params.row_stride, params.col_stride,
+ params.pad_planes, params.pad_rows, params.pad_cols,
+ tensor_top_diff.flat<T>().data(), tensor_bottom_diff->flat<T>().data(),
+ context->eigen_gpu_device());
+ if (!status) {
+ context->SetStatus(
+ errors::Internal("Failed launching MaxPool3dGradBackward"));
+ }
+ }
+};
+
+#define REGISTER_GPU_KERNELS(T) REGISTER_KERNELS(GPU, T)
+TF_CALL_float(REGISTER_GPU_KERNELS) TF_CALL_half(REGISTER_GPU_KERNELS)
+#undef REGISTER_GPU_KERNELS
#endif // GOOGLE_CUDA
+#undef REGISTER_KERNELS
+
} // namespace tensorflow
diff --git a/tensorflow/core/kernels/pooling_ops_3d.h b/tensorflow/core/kernels/pooling_ops_3d.h
new file mode 100644
index 0000000000..7954e2cf83
--- /dev/null
+++ b/tensorflow/core/kernels/pooling_ops_3d.h
@@ -0,0 +1,66 @@
+/* Copyright 2015 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifndef TENSORFLOW_KERNELS_POOLING_OPS_3D_H_
+#define TENSORFLOW_KERNELS_POOLING_OPS_3D_H_
+
+#include "tensorflow/core/framework/op_kernel.h"
+#include "tensorflow/core/util/padding.h"
+#include "tensorflow/core/util/tensor_format.h"
+
+namespace tensorflow {
+
+// A helper class to manage sizes and shapes for 3d pooling operations.
+struct Pool3dParameters {
+ // Updates context->status if there is an invalid input.
+ Pool3dParameters(OpKernelContext* context, const std::vector<int32>& ksize,
+ const std::vector<int32>& stride, Padding padding,
+ TensorFormat data_format,
+ const TensorShape& tensor_in_shape);
+
+ // Returns the shape of the output for "forward" pooling operations.
+ TensorShape forward_output_shape();
+
+ int depth;
+
+ int tensor_in_planes;
+ int tensor_in_cols;
+ int tensor_in_rows;
+ int tensor_in_batch;
+
+ int window_planes;
+ int window_cols;
+ int window_rows;
+ int depth_window;
+
+ int plane_stride;
+ int col_stride;
+ int row_stride;
+ int depth_stride;
+
+ int64 out_plane;
+ int64 out_height;
+ int64 out_width;
+
+ int64 pad_planes;
+ int64 pad_cols;
+ int64 pad_rows;
+
+ TensorFormat data_format;
+};
+
+} // namespace tensorflow
+
+#endif // TENSORFLOW_KERNELS_POOLING_OPS_3D_H_
diff --git a/tensorflow/core/kernels/pooling_ops_3d_gpu.cu.cc b/tensorflow/core/kernels/pooling_ops_3d_gpu.cu.cc
new file mode 100644
index 0000000000..341a43c368
--- /dev/null
+++ b/tensorflow/core/kernels/pooling_ops_3d_gpu.cu.cc
@@ -0,0 +1,172 @@
+/* Copyright 2016 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#if GOOGLE_CUDA
+
+#define EIGEN_USE_GPU
+
+#include "tensorflow/core/framework/register_types.h"
+#include "tensorflow/core/kernels/pooling_ops_3d_gpu.h"
+#include "tensorflow/core/util/cuda_kernel_helper.h"
+#include "tensorflow/core/util/tensor_format.h"
+
+namespace tensorflow {
+
+namespace {
+
+template <typename dtype>
+__global__ void MaxPoolGradBackwardNoMaskNCDHW(
+ const int nthreads, const dtype* bottom_data, const dtype* output_data,
+ const int pooled_plane, const int pooled_height, const int pooled_width,
+ const int channels, const int plane, const int height, const int width,
+ const int kernel_p, const int kernel_h, const int kernel_w,
+ const int stride_p, const int stride_h, const int stride_w, const int pad_p,
+ const int pad_t, const int pad_l, const dtype* top_diff,
+ dtype* bottom_diff) {
+ CUDA_1D_KERNEL_LOOP(index, nthreads) {
+ // First find out the index to the maximum, since we have no mask.
+ int pw = index % pooled_width;
+ int ph = (index / pooled_width) % pooled_height;
+ int pp = (index / pooled_width / pooled_height) % pooled_plane;
+ int c = (index / pooled_width / pooled_height / pooled_plane) % channels;
+ int n = (index / pooled_width / pooled_height / pooled_plane / channels);
+ int pstart = pp * stride_p - pad_p;
+ int hstart = ph * stride_h - pad_t;
+ int wstart = pw * stride_w - pad_l;
+ const int pend = min(pstart + kernel_p, plane);
+ const int hend = min(hstart + kernel_h, height);
+ const int wend = min(wstart + kernel_w, width);
+ pstart = max(pstart, 0);
+ hstart = max(hstart, 0);
+ wstart = max(wstart, 0);
+ bool should_stop = false;
+ int maxidx = -1;
+ const dtype* bottom_data_n =
+ bottom_data + n * channels * plane * height * width;
+ // Propagate only first value from top_diff corresponding to the maximum.
+ for (int p = pstart; p < pend && !should_stop; ++p) {
+ for (int h = hstart; h < hend && !should_stop; ++h) {
+ for (int w = wstart; w < wend && !should_stop; ++w) {
+ int idx = c * plane * height * width + (p * height + h) * width + w;
+ if (output_data[index] == bottom_data_n[idx]) {
+ maxidx = idx;
+ should_stop = true;
+ }
+ }
+ }
+ }
+ // Set the bottom diff (atomic is not necessary). The index could still be
+ // uninitialized, if all the bottom_data are NaN.
+ if (maxidx != -1) {
+ bottom_diff[index] =
+ top_diff[n * channels * plane * height * width + maxidx];
+ }
+ }
+}
+
+template <typename dtype>
+__global__ void MaxPoolGradBackwardNoMaskNDHWC(
+ const int nthreads, const dtype* bottom_data, const dtype* output_data,
+ const int pooled_plane, const int pooled_height, const int pooled_width,
+ const int channels, const int plane, const int height, const int width,
+ const int kernel_p, const int kernel_h, const int kernel_w,
+ const int stride_p, const int stride_h, const int stride_w, const int pad_p,
+ const int pad_t, const int pad_l, const dtype* top_diff,
+ dtype* bottom_diff) {
+ CUDA_1D_KERNEL_LOOP(index, nthreads) {
+ // First find out the index to the maximum, since we have no mask.
+ int n = index;
+ int c = n % channels;
+ n /= channels;
+ int wstart = (n % pooled_width) * stride_w - pad_l;
+ int wend = min(wstart + kernel_w, width);
+ wstart = max(wstart, 0);
+ n /= pooled_width;
+ int hstart = (n % pooled_height) * stride_h - pad_t;
+ int hend = min(hstart + kernel_h, height);
+ hstart = max(hstart, 0);
+ n /= pooled_height;
+ int pstart = (n % pooled_plane) * stride_p - pad_p;
+ int pend = min(pstart + kernel_p, plane);
+ pstart = max(pstart, 0);
+ n /= pooled_plane;
+ bool should_stop = false;
+ int maxidx = -1;
+ const dtype* bottom_data_n =
+ bottom_data + n * plane * height * width * channels;
+ // Propagate only first value from top_diff corresponding to the maximum.
+ for (int p = pstart; p < pend && !should_stop; ++p) {
+ for (int h = hstart; h < hend && !should_stop; ++h) {
+ for (int w = wstart; w < wend && !should_stop; ++w) {
+ int idx = ((p * height + h) * width + w) * channels + c;
+ if (output_data[index] == bottom_data_n[idx]) {
+ maxidx = idx;
+ should_stop = true;
+ }
+ }
+ }
+ }
+ // Set the bottom diff (atomic is not necessary). The index could still be
+ // uninitialized, if all the bottom_data are NaN.
+ if (maxidx != -1) {
+ bottom_diff[index] =
+ top_diff[n * plane * height * width * channels + maxidx];
+ }
+ }
+}
+
+} // namespace
+
+namespace functor {
+
+template <typename T>
+bool MaxPool3dGradBackward<T>::operator()(
+ TensorFormat data_format, const T* bottom_data, const T* output_data,
+ const int batch, const int pooled_plane, const int pooled_height,
+ const int pooled_width, const int channels, const int plane,
+ const int height, const int width, const int kernel_p, const int kernel_h,
+ const int kernel_w, const int stride_p, const int stride_h,
+ const int stride_w, const int pad_p, const int pad_t, const int pad_l,
+ const T* top_diff, T* bottom_diff, const Eigen::GpuDevice& d) {
+ int num_kernels =
+ batch * channels * pooled_plane * pooled_height * pooled_width;
+ CudaLaunchConfig config = GetCudaLaunchConfig(num_kernels, d);
+ if (data_format == FORMAT_NHWC) {
+ MaxPoolGradBackwardNoMaskNDHWC<<<config.block_count,
+ config.thread_per_block, 0, d.stream()>>>(
+ num_kernels, bottom_data, output_data, pooled_plane, pooled_height,
+ pooled_width, channels, plane, height, width, kernel_p, kernel_h,
+ kernel_w, stride_p, stride_h, stride_w, pad_p, pad_t, pad_l, top_diff,
+ bottom_diff);
+ } else {
+ MaxPoolGradBackwardNoMaskNCDHW<<<config.block_count,
+ config.thread_per_block, 0, d.stream()>>>(
+ num_kernels, bottom_data, output_data, pooled_plane, pooled_height,
+ pooled_width, channels, plane, height, width, kernel_p, kernel_h,
+ kernel_w, stride_p, stride_h, stride_w, pad_p, pad_t, pad_l, top_diff,
+ bottom_diff);
+ }
+ return d.ok();
+}
+
+} // namespace functor
+
+#define DEFINE_GPU_SPECS(T) template struct functor::MaxPool3dGradBackward<T>;
+TF_CALL_GPU_NUMBER_TYPES(DEFINE_GPU_SPECS);
+#undef DEFINE_GPU_SPECS
+
+} // namespace tensorflow
+
+#endif // GOOGLE_CUDA
diff --git a/tensorflow/core/kernels/pooling_ops_3d_gpu.h b/tensorflow/core/kernels/pooling_ops_3d_gpu.h
new file mode 100644
index 0000000000..350b1b6732
--- /dev/null
+++ b/tensorflow/core/kernels/pooling_ops_3d_gpu.h
@@ -0,0 +1,48 @@
+/* Copyright 2015 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#if !GOOGLE_CUDA
+#error This file must only be included when building with CUDA support
+#endif
+
+#ifndef TENSORFLOW_CORE_KERNELS_POOLING_OP_3D_GPU_H_
+#define TENSORFLOW_CORE_KERNELS_POOLING_OP_3D_GPU_H_
+
+#define EIGEN_USE_GPU
+
+#include "tensorflow/core/framework/tensor_types.h"
+#include "tensorflow/core/platform/types.h"
+#include "tensorflow/core/util/tensor_format.h"
+
+namespace tensorflow {
+
+namespace functor {
+template <typename T>
+struct MaxPool3dGradBackward {
+ bool operator()(TensorFormat data_format, const T* bottom_data,
+ const T* output_data, const int batch, const int pooled_plane,
+ const int pooled_height, const int pooled_width,
+ const int channels, const int plane, const int height,
+ const int width, const int kernel_p, const int kernel_h,
+ const int kernel_w, const int stride_p, const int stride_h,
+ const int stride_w, const int pad_p, const int pad_t,
+ const int pad_l, const T* top_diff, T* bottom_diff,
+ const Eigen::GpuDevice& d);
+};
+} // namespace functor
+
+} // namespace tensorflow
+
+#endif // TENSORFLOW_CORE_KERNELS_POOLING_OP_3D_GPU_H_
diff --git a/tensorflow/core/kernels/pooling_ops_common.cc b/tensorflow/core/kernels/pooling_ops_common.cc
index 3fe16c66b8..37747a3199 100644
--- a/tensorflow/core/kernels/pooling_ops_common.cc
+++ b/tensorflow/core/kernels/pooling_ops_common.cc
@@ -17,6 +17,7 @@ limitations under the License.
#include <vector>
#include "tensorflow/core/common_runtime/device.h"
+#include "tensorflow/core/framework/register_types.h"
#include "tensorflow/core/framework/tensor.h"
#if GOOGLE_CUDA
@@ -127,8 +128,7 @@ namespace functor {
typename TTypes<T, 4>::Tensor out); \
extern template struct TransformDepth<GPUDevice, T, Eigen::DenseIndex>;
-DECLARE_GPU_SPEC(float);
-DECLARE_GPU_SPEC(Eigen::half);
+TF_CALL_GPU_NUMBER_TYPES(DECLARE_GPU_SPEC)
#undef DECLARE_GPU_SPEC
} // namespace functor
@@ -373,10 +373,11 @@ void DnnPoolingGradOp<T>::Compute(
}
}
-template class DnnPoolingOp<Eigen::half>;
-template class DnnPoolingOp<float>;
-template class DnnPoolingGradOp<Eigen::half>;
-template class DnnPoolingGradOp<float>;
+#define DEFINE_DNN_OPS(T) \
+ template class DnnPoolingOp<T>; \
+ template class DnnPoolingGradOp<T>;
+TF_CALL_GPU_NUMBER_TYPES(DEFINE_DNN_OPS)
+#undef DEFINE_DNN_OPS
#endif // GOOGLE_CUDA
diff --git a/tensorflow/core/kernels/random_op.cc b/tensorflow/core/kernels/random_op.cc
index f3c7e0f26b..3063fedac8 100644
--- a/tensorflow/core/kernels/random_op.cc
+++ b/tensorflow/core/kernels/random_op.cc
@@ -303,10 +303,7 @@ class RandomGammaOp : public OpKernel {
&samples_shape));
}
const int64 num_samples = samples_shape.num_elements();
- OP_REQUIRES(ctx, num_samples > 0,
- errors::InvalidArgument(
- "Input shape should have non-zero element count, got: ",
- num_samples));
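+ // Return early when the requested sample shape has no elements.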
+ if (num_samples == 0) return;
samples_shape.AppendShape(alpha_t.shape());
// Allocate output samples.
diff --git a/tensorflow/core/kernels/sparse_matmul_op.cc b/tensorflow/core/kernels/sparse_matmul_op.cc
index 2ed0522ce4..46e743b4cf 100644
--- a/tensorflow/core/kernels/sparse_matmul_op.cc
+++ b/tensorflow/core/kernels/sparse_matmul_op.cc
@@ -837,15 +837,6 @@ class SparseMatMul {
};
#ifdef TENSORFLOW_USE_LIBXSMM
-#ifdef EXTRA_CACHE_LOGGING
-static tensorflow::mutex global_cache_stats_lock;
-static int total_num_entries_outstanding GUARDED_BY(global_cache_stats_lock) =
- 0;
-static int total_num_entries_in_cache GUARDED_BY(global_cache_stats_lock) = 0;
-#endif // EXTRA_CACHE_LOGGING
-
-static const int max_entries_per_graph_node = 40;
-
template <typename TL, typename TR>
class LibxsmmSparseMatMul {
typedef Eigen::Tensor<TL, 2, Eigen::RowMajor> MatrixL;
@@ -861,7 +852,6 @@ class LibxsmmSparseMatMul {
MatrixMapR;
public:
-#if 1
// This structure contains a set of libxsmm kernels for sizes that have been
// encountered previously by this operator so that libxsmm does not need to
// reallocate its scratchpad memory each time (which hurts performance
@@ -880,181 +870,57 @@ class LibxsmmSparseMatMul {
// useful (it is an empty struct right now)
typename SparseMatMul<TL, TR>::TensorInfoCache
non_libxsmm_cache; // Currently not used
- TF_DISALLOW_COPY_AND_ASSIGN(TensorInfoCacheEntry);
- ~TensorInfoCacheEntry() {
-#ifdef EXTRA_CACHE_LOGGING
- LOG(INFO) << "Deleting tensor cache entry at " << (void*)this;
-#endif // EXTRA_CACHE_LOGGING
- libxsmm_spmdm_destroy(&handle);
- }
};
- // protects entries; invariant: entries is a valid std::list.
+ // protects entries; invariant: entries is a valid std::multimap.
tensorflow::mutex lock;
// Because there could be multiple matrix multiplies with the same sizes
// going on at the same time, we need to allow multiple cache entries for a
// given set of parameters. Taking and returning entries is used to make
// sure the same cache entry is not used from two threads at a time.
- using entries_map_type = std::list<std::pair<
- std::tuple<int, int, int, int>,
- std::unique_ptr<TensorInfoCacheEntry>>>; // multimap in LRU order
- entries_map_type entries GUARDED_BY(
- lock); // MRU element at end so reverse search will find it first
- int num_entries_outstanding GUARDED_BY(lock);
-
- TensorInfoCache() : lock(), entries(), num_entries_outstanding(0) {}
+ std::multimap<std::tuple<int, int, int, int>,
+ std::unique_ptr<TensorInfoCacheEntry>>
+ entries GUARDED_BY(lock);
+
+ TensorInfoCache() : lock(), entries() {}
// Look up and remove first entry with these parameters, creating one if
// there isn't one
std::unique_ptr<TensorInfoCacheEntry> take_cache_entry(int M, int K, int N,
int max_threads)
-#ifdef EXTRA_CACHE_LOGGING
- LOCKS_EXCLUDED(lock, global_cache_stats_lock)
-#else
- LOCKS_EXCLUDED(lock)
-#endif
- {
+ LOCKS_EXCLUDED(lock) {
tensorflow::mutex_lock ml(lock);
-#ifdef EXTRA_CACHE_LOGGING
- tensorflow::mutex_lock ml2(global_cache_stats_lock);
-#endif
auto key = std::make_tuple(M, K, N, max_threads);
- auto it_rev =
- std::find_if(entries.rbegin(), entries.rend(),
- [&](const typename entries_map_type::value_type& e) {
- return e.first == key;
- });
- auto it =
- (it_rev == entries.rend() ? entries.end() : std::next(it_rev).base());
+ auto it = entries.find(key);
if (it != entries.end()) {
auto val = std::move(it->second);
entries.erase(it);
- ++num_entries_outstanding;
-#ifdef EXTRA_CACHE_LOGGING
- ++total_num_entries_outstanding;
- --total_num_entries_in_cache;
- LOG(INFO) << "Used existing cache entry at " << (void*)val.get()
- << " for " << M << "x" << K << "x" << N << " max_threads "
- << max_threads
- << ", num_entries_outstanding = " << num_entries_outstanding
- << ", new cache size = " << entries.size()
- << ", total num_entries_outstanding = "
- << total_num_entries_outstanding
- << ", total cache size = " << total_num_entries_in_cache;
-#endif
return val;
} else {
- while (!entries.empty() &&
- entries.size() + num_entries_outstanding + 1 >
- max_entries_per_graph_node) {
-#ifdef EXTRA_CACHE_LOGGING
- LOG(INFO) << "Removing old cache entry at "
- << (void*)entries.front().second.get();
-#endif
- entries.pop_front();
- }
std::unique_ptr<TensorInfoCacheEntry> e{
new TensorInfoCacheEntry{M, K, N, max_threads, {}, nullptr}};
// setup scoped allocator, which uses cpu_allocator() for this scope
const libxsmm_tf_allocator<libxsmm_scratch_allocator> tf_allocator;
libxsmm_spmdm_init(M, N, K, max_threads, &e->handle, &e->output_csr);
- ++num_entries_outstanding;
-#ifdef EXTRA_CACHE_LOGGING
- ++total_num_entries_outstanding;
- LOG(INFO) << "Created cache entry at " << (void*)e.get() << " for " << M
- << "x" << K << "x" << N << " max_threads " << max_threads
- << ", num_entries_outstanding = " << num_entries_outstanding
- << ", new cache size = " << entries.size()
- << ", total num_entries_outstanding = "
- << total_num_entries_outstanding
- << ", total cache size = " << total_num_entries_in_cache;
-#endif
return e;
}
}
// Add a cache entry with certain parameters
void return_cache_entry(std::unique_ptr<TensorInfoCacheEntry> e)
-#ifdef EXTRA_CACHE_LOGGING
- LOCKS_EXCLUDED(lock, global_cache_stats_lock)
-#else
- LOCKS_EXCLUDED(lock)
-#endif
- {
+ LOCKS_EXCLUDED(lock) {
tensorflow::mutex_lock ml(lock);
-#ifdef EXTRA_CACHE_LOGGING
- tensorflow::mutex_lock ml2(global_cache_stats_lock);
-#endif
auto key = std::make_tuple(e->M, e->K, e->N, e->max_threads);
- --num_entries_outstanding;
-#ifdef EXTRA_CACHE_LOGGING
- --total_num_entries_outstanding;
- LOG(INFO) << "Returned cache entry at " << (void*)e.get() << " for "
- << e->M << "x" << e->K << "x" << e->N << " max_threads "
- << e->max_threads
- << ", num_entries_outstanding = " << num_entries_outstanding
- << ", prev cache size = " << entries.size()
- << ", total num_entries_outstanding = "
- << total_num_entries_outstanding
- << ", total cache size = " << total_num_entries_in_cache;
-#endif
- entries.push_back(std::make_pair(key, std::move(e)));
-#ifdef EXTRA_CACHE_LOGGING
- ++total_num_entries_in_cache;
-#endif
+ entries.insert(std::make_pair(key, std::move(e)));
}
~TensorInfoCache() {
tensorflow::mutex_lock ml(lock);
-#ifdef EXTRA_CACHE_LOGGING
- tensorflow::mutex_lock ml2(global_cache_stats_lock);
- LOG(INFO) << "Deleting TensorInfoCache, cache size = " << entries.size()
- << ", total num_entries_outstanding = "
- << total_num_entries_outstanding
- << ", total cache size = " << total_num_entries_in_cache;
-#endif
- CHECK_EQ(num_entries_outstanding, 0);
+ for (auto& p : entries) {
+ libxsmm_spmdm_destroy(&p.second->handle);
+ }
entries.clear();
}
private:
TF_DISALLOW_COPY_AND_ASSIGN(TensorInfoCache);
};
-#else
- // This structure contains a set of libxsmm kernels for sizes that have been
- // encountered previously by this operator so that libxsmm does not need to
- // reallocate its scratchpad memory each time (which hurts performance
- // substantially).
- struct TensorInfoCache {
- struct TensorInfoCacheEntry {
- // Parameters for kernel
- int M;
- int K;
- int N;
- int max_threads;
- // libxsmm handle and matrix data
- libxsmm_spmdm_handle handle;
- libxsmm_CSR_sparseslice* output_csr;
- // Chain to non-libxsmm implementation's cache in case that ever becomes
- // useful (it is an empty struct right now)
- typename SparseMatMul<TL, TR>::TensorInfoCache
- non_libxsmm_cache; // Currently not used
- };
- TensorInfoCache() {}
- // Look up and remove first entry with these parameters, creating one if
- // there isn't one
- std::unique_ptr<TensorInfoCacheEntry> take_cache_entry(int M, int K, int N,
- int max_threads) {
- std::unique_ptr<TensorInfoCacheEntry> e{
- new TensorInfoCacheEntry{M, K, N, max_threads, {}, nullptr}};
- libxsmm_spmdm_init(M, N, K, max_threads, &e->handle, &e->output_csr);
- return e;
- }
- // Add a cache entry with certain parameters
- void return_cache_entry(std::unique_ptr<TensorInfoCacheEntry> e) {
- libxsmm_spmdm_destroy(&e->handle);
- }
-
- private:
- TF_DISALLOW_COPY_AND_ASSIGN(TensorInfoCache);
- };
-#endif
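
[Editor's note] The take/return discipline described in the comments above generalizes beyond libxsmm. Below is a minimal sketch of the same pattern with the libxsmm handle and CSR storage replaced by a generic payload; the names (TakeReturnCache, Entry, give_back) are illustrative and do not appear in this file.

#include <map>
#include <memory>
#include <mutex>
#include <tuple>
#include <utility>

struct Entry {
  int M, K, N;
  // handle, CSR storage, etc. would live here
};

class TakeReturnCache {
 public:
  // Remove an entry from the cache so no other thread can hand it out;
  // create a fresh one if no cached entry matches these sizes.
  std::unique_ptr<Entry> take(int M, int K, int N) {
    std::lock_guard<std::mutex> l(mu_);
    auto it = entries_.find(std::make_tuple(M, K, N));
    if (it != entries_.end()) {
      std::unique_ptr<Entry> e = std::move(it->second);
      entries_.erase(it);
      return e;
    }
    return std::unique_ptr<Entry>(new Entry{M, K, N});
  }

  // Re-insert after use; a multimap allows several live entries per key,
  // one for each multiply with these sizes running concurrently.
  void give_back(std::unique_ptr<Entry> e) {
    std::lock_guard<std::mutex> l(mu_);
    auto key = std::make_tuple(e->M, e->K, e->N);
    entries_.insert(std::make_pair(key, std::move(e)));
  }

 private:
  std::mutex mu_;
  std::multimap<std::tuple<int, int, int>, std::unique_ptr<Entry>> entries_;
};
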
// Perform matrix multiplication of "left" and "right", and store the result
// in *"output".
@@ -1479,21 +1345,21 @@ inline void SparseMatMul<TL, TR>::ComputeBlockSizes(
template <typename F>
void do_on_all_threads(const DeviceBase::CpuWorkerThreads* thread_pool,
- ptrdiff_t max_thread_count, const F& f) {
+ const F& f) {
int num_threads = thread_pool->num_threads;
if (num_threads == 0) {
LOG(FATAL) << "Have 0 threads in thread pool";
} else if (num_threads == 1) {
- f(0, 1);
+ f(0);
} else {
BlockingCounter counter(num_threads - 1);
for (int i = 1; i < num_threads; ++i) {
thread_pool->workers->Schedule([&, i]() {
- f(i, num_threads);
+ f(i);
counter.DecrementCount();
});
}
- f(0, num_threads);
+ f(0);
counter.Wait();
}
}
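
[Editor's note] A hypothetical wrapper showing how a helper like do_on_all_threads is typically driven, matching the usage later in this patch: a shared atomic counter hands out work items, and the helper's internal BlockingCounter guarantees all workers finish before the call returns. parallel_for_items and WorkFn are illustrative names that assume this file's do_on_all_threads and TensorFlow's thread-pool type.

#include <atomic>

template <typename WorkFn>
void parallel_for_items(const DeviceBase::CpuWorkerThreads* thread_pool,
                        int total_items, const WorkFn& work) {
  std::atomic<int> next_item(0);
  do_on_all_threads(thread_pool, [&](int thread_id) {
    // Each thread pulls items until the shared counter runs past the end.
    while (true) {
      int item = next_item.fetch_add(1);
      if (item >= total_items) break;
      work(item, thread_id);
    }
  });
}
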
@@ -1522,24 +1388,21 @@ void wrapper_libxsmm_spmdm_createSparseSlice_generic_thread(
void wrapper_libxsmm_spmdm_compute_generic_thread(
empty_type_wrapper<bfloat16>, const libxsmm_spmdm_handle* handle,
- char transA, char transB, libxsmm_CSR_sparseslice* A_sparse,
- const bfloat16* B, char transC, float* C, int block_id, int tid,
- int nthreads) {
- const uint16 alpha = 1;
- const uint16 beta = 0;
+ char transA, char transB, const bfloat16* alpha,
+ libxsmm_CSR_sparseslice* A_sparse, const bfloat16* B, char transC,
+ const bfloat16* beta, float* C, int block_id, int tid, int nthreads) {
return libxsmm_spmdm_compute_bfloat16_thread(
- handle, transA, transB, &alpha, A_sparse,
- reinterpret_cast<const uint16*>(B), transC, &beta, C, block_id, tid,
- nthreads);
+ handle, transA, transB, reinterpret_cast<const uint16*>(alpha), A_sparse,
+ reinterpret_cast<const uint16*>(B), transC,
+ reinterpret_cast<const uint16*>(beta), C, block_id, tid, nthreads);
}
void wrapper_libxsmm_spmdm_compute_generic_thread(
empty_type_wrapper<float>, const libxsmm_spmdm_handle* handle, char transA,
- char transB, libxsmm_CSR_sparseslice* A_sparse, const float* B, char transC,
- float* C, int block_id, int tid, int nthreads) {
- const float alpha = 1.f;
- const float beta = 0.f;
- return libxsmm_spmdm_compute_fp32_thread(handle, transA, transB, &alpha,
- A_sparse, B, transC, &beta, C,
+ char transB, const float* alpha, libxsmm_CSR_sparseslice* A_sparse,
+ const float* B, char transC, const float* beta, float* C, int block_id,
+ int tid, int nthreads) {
+ return libxsmm_spmdm_compute_fp32_thread(handle, transA, transB, alpha,
+ A_sparse, B, transC, beta, C,
block_id, tid, nthreads);
}
@@ -1590,13 +1453,11 @@ inline void LibxsmmSparseMatMul<TL, TR>::Compute(
const int left_dim1 = transpose_left ? left.dimension(0) : left.dimension(1);
const int right_dim0 = right.dimension(0);
const int right_dim1 = right.dimension(1);
- const int output_dim0 =
- transpose_output ? output->dimension(1) : output->dimension(0);
- const int output_dim1 =
- transpose_output ? output->dimension(0) : output->dimension(1);
CHECK_EQ(left_dim1, right_dim0);
- CHECK_EQ(left_dim0, output_dim0);
- CHECK_EQ(right_dim1, output_dim1);
+ CHECK_EQ(left_dim0,
+ (transpose_output ? output->dimension(1) : output->dimension(0)));
+ CHECK_EQ(right_dim1,
+ (transpose_output ? output->dimension(0) : output->dimension(1)));
if (left_dim0 < 32 || left_dim1 < 32 || right_dim1 < 32) {
// Causes problems in libxsmm
SparseMatMul<TL, TR>::Compute(
@@ -1614,50 +1475,42 @@ inline void LibxsmmSparseMatMul<TL, TR>::Compute(
// Convert the left matrix to compressed sparse row (CSR) format
ptrdiff_t total_num_creation_blocks =
libxsmm_spmdm_get_num_createSparseSlice_blocks(&entry->handle);
- ptrdiff_t total_num_mult_blocks =
- libxsmm_spmdm_get_num_compute_blocks(&entry->handle);
- bool use_libxsmm =
- !(total_num_creation_blocks + total_num_mult_blocks < num_threads &&
- !transpose_left && !transpose_output);
- if (!use_libxsmm) {
- // Avoid some performance issues in libxsmm (FIXME)
- cache->return_cache_entry(std::move(entry));
- SparseMatMul<TL, TR>::Compute(
- nullptr /* Assumes no cached data for fallback */, left, right,
- transpose_left, thread_pool, transpose_output, output);
- return;
- }
std::atomic<int> cur_create_block_number;
cur_create_block_number.store(0);
- do_on_all_threads(thread_pool, total_num_creation_blocks,
- [&](int i, int actual_num_threads) {
- PinnedToCurrentCPU pin;
- while (true) {
- int work_item = cur_create_block_number.fetch_add(1);
- if (work_item >= total_num_creation_blocks) break;
- wrapper_libxsmm_spmdm_createSparseSlice_generic_thread(
- empty_type_wrapper<TL>{}, &entry->handle,
- (transpose_left ? 'T' : 'N'), left_data,
- entry->output_csr, work_item, i,
- actual_num_threads);
- }
- });
+ do_on_all_threads(thread_pool, [&](int i) {
+ PinnedToCurrentCPU pin;
+ while (true) {
+ int work_item = cur_create_block_number.fetch_add(1);
+ if (work_item >= total_num_creation_blocks) break;
+ wrapper_libxsmm_spmdm_createSparseSlice_generic_thread(
+ empty_type_wrapper<TL>{}, &entry->handle,
+ (transpose_left ? 'T' : 'N'), left_data, entry->output_csr, work_item,
+ i, num_threads);
+ }
+ });
// Do matrix-matrix multiplication
+  // TODO(jewillco): libxsmm doesn't support beta != 1 yet -- remove this
+  // memset when a release includes beta handling.
+ memset(output_data, 0, left_dim0 * right_dim1 * sizeof(TR));
+ ptrdiff_t total_num_mult_blocks =
+ libxsmm_spmdm_get_num_compute_blocks(&entry->handle);
std::atomic<int> cur_mult_block_number;
cur_mult_block_number.store(0);
- do_on_all_threads(
- thread_pool, total_num_mult_blocks, [&](int i, int actual_num_threads) {
- PinnedToCurrentCPU pin;
- while (true) {
- int work_item = cur_mult_block_number.fetch_add(1);
- if (work_item >= total_num_mult_blocks) break;
- wrapper_libxsmm_spmdm_compute_generic_thread(
- empty_type_wrapper<TL>{}, &entry->handle,
- (transpose_left ? 'T' : 'N'), 'N', entry->output_csr, right_data,
- (transpose_output ? 'T' : 'N'), output_data, work_item, i,
- actual_num_threads);
- }
- });
+ do_on_all_threads(thread_pool, [&](int i) {
+ PinnedToCurrentCPU pin;
+ while (true) {
+ int work_item = cur_mult_block_number.fetch_add(1);
+ if (work_item >= total_num_mult_blocks) break;
+ const TL alpha(1.0); // Stored in a variable so we can get a pointer
+ const TL beta(0.0); // Stored in a variable so we can get a pointer
+ wrapper_libxsmm_spmdm_compute_generic_thread(
+ empty_type_wrapper<TL>{}, &entry->handle,
+ (transpose_left ? 'T' : 'N'), 'N', &alpha, entry->output_csr,
+ right_data, (transpose_output ? 'T' : 'N'), &beta, output_data,
+ work_item, i, num_threads);
+ }
+ });
// Put handle + CSR storage back into cache
cache->return_cache_entry(std::move(entry));
}
@@ -1803,17 +1656,15 @@ inline void SparseMatMul<TL, TR>::Compute(
SparseMatMulOp<TA, TB, LibxsmmSparseMatMul>);
#endif
+REGISTER_SPARSE_MATMUL(bfloat16, bfloat16);
+
REGISTER_SPARSE_MATMUL(float, bfloat16);
REGISTER_SPARSE_MATMUL(bfloat16, float);
#ifdef TENSORFLOW_USE_LIBXSMM
-REGISTER_SPARSE_MATMUL_LIBXSMM(bfloat16, bfloat16);
-
REGISTER_SPARSE_MATMUL_LIBXSMM(float, float);
#else
-REGISTER_SPARSE_MATMUL(bfloat16, bfloat16);
-
REGISTER_SPARSE_MATMUL(float, float);
#endif
diff --git a/tensorflow/core/kernels/sparse_matmul_op.h b/tensorflow/core/kernels/sparse_matmul_op.h
index bff6a0c9b3..61bd6593c3 100644
--- a/tensorflow/core/kernels/sparse_matmul_op.h
+++ b/tensorflow/core/kernels/sparse_matmul_op.h
@@ -255,13 +255,12 @@ EIGEN_STRONG_INLINE Packet8d pbroadcast_second<Packet8d>(const Packet8d& a_in) {
}
template <>
EIGEN_STRONG_INLINE Packet8d pbroadcast_third<Packet8d>(const Packet8d& a_in) {
- Packet2d a = _mm256_extractf128_pd(_mm512_castpd512_pd256(a_in), 1);
+ Packet2d a = _mm512_extractf32x4_ps(a_in, 1);
return _mm512_broadcastsd_pd(a);
}
template <>
EIGEN_STRONG_INLINE Packet8d pbroadcast_fourth<Packet8d>(const Packet8d& a_in) {
- Packet2d a =
- _mm_permute_pd(_mm256_extractf128_pd(_mm512_castpd512_pd256(a_in), 1), 3);
+ Packet2d a = _mm_permute_pd(_mm512_extractf32x4_ps(a_in, 1), 3);
return _mm512_broadcastsd_pd(a);
}
template <>
@@ -418,17 +417,14 @@ EIGEN_STRONG_INLINE Packet8f pbroadcast_fourth<Packet8f>(const Packet8f& a) {
template <typename Packet>
EIGEN_DEVICE_FUNC inline Packet16f pexpand_bf16_l(const Packet16f& from) {
- return _mm512_castsi512_ps(_mm512_slli_epi32(
- _mm512_cvtepu16_epi32(_mm512_castsi512_si256(_mm512_castps_si512(from))),
- 16));
+ return _mm512_slli_epi32(_mm512_cvtepu16_epi32(_mm512_castsi512_si256(from)),
+ 16);
}
template <typename Packet>
EIGEN_DEVICE_FUNC inline Packet16f pexpand_bf16_u(const Packet16f& from) {
- return _mm512_castsi512_ps(
- _mm512_slli_epi32(_mm512_cvtepu16_epi32(_mm256_castpd_si256(
- _mm512_extractf64x4_pd(_mm512_castps_pd(from), 1))),
- 16));
+ return _mm512_slli_epi32(
+ _mm512_cvtepu16_epi32(_mm512_extractf64x4_pd(from, 1)), 16);
}
#endif
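
[Editor's note] The two helpers above widen packed bfloat16 values to float by moving each 16-bit pattern into the high half of a 32-bit lane. The scalar equivalent makes the trick explicit; this standalone sketch is illustrative and is not the SIMD code above.

#include <cstdint>
#include <cstring>

// bfloat16 is the upper 16 bits of an IEEE-754 float, so shifting the bit
// pattern left by 16 reconstructs the float exactly (low mantissa bits 0).
float expand_bf16(uint16_t b) {
  uint32_t bits = static_cast<uint32_t>(b) << 16;
  float f;
  std::memcpy(&f, &bits, sizeof(f));  // bit-level reinterpretation
  return f;
}
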
diff --git a/tensorflow/core/kernels/xsmm_conv2d.cc b/tensorflow/core/kernels/xsmm_conv2d.cc
index 823cdf7e09..878abe9712 100644
--- a/tensorflow/core/kernels/xsmm_conv2d.cc
+++ b/tensorflow/core/kernels/xsmm_conv2d.cc
@@ -26,14 +26,18 @@ void dummy_xsmm_conv2d_ensure_file_is_not_empty(void);
#include "tensorflow/core/kernels/xsmm_conv2d.h"
#include <stdlib.h>
+#include <cstring>
+#if 0
+#include <omp.h>
+#endif
#include "tensorflow/core/framework/op_kernel.h"
#include "tensorflow/core/lib/core/blocking_counter.h"
#include "tensorflow/core/lib/core/threadpool.h"
+#include "libxsmm_main.h" // TODO(bsteiner): API to avoid incl. header from src/
#include "include/libxsmm_cpuid.h"
-#include "libxsmm_dnn_handle.h"
-#include "libxsmm_malloc.h"
+#include "include/libxsmm_malloc.h"
namespace tensorflow {
@@ -59,10 +63,6 @@ bool CanUseXsmmConv2D(const libxsmm_dnn_conv_desc& desc,
VLOG(1) << "Cannot use XSMM convolutions: unsupported format!";
return false;
}
- if (desc.pad_h_in != 0 || desc.pad_w_in != 0) {
- VLOG(1) << "Cannot use XSMM convolutions: unsupported padding!";
- return false;
- }
if (desc.K % VECTOR_SIZE != 0) {
VLOG(1) << "Cannot use XSMM convolutions: output features count not"
" divisible by vector size!";
@@ -72,7 +72,6 @@ bool CanUseXsmmConv2D(const libxsmm_dnn_conv_desc& desc,
return true;
}
-
typedef Eigen::ThreadPoolDevice CPUDevice;
namespace functor {
@@ -83,25 +82,34 @@ static void chk_libxsmm_err(libxsmm_dnn_err_t status, string msg) {
}
}
-LIBXSMM_INLINE void copy_RSCK_to_custom(const float* rsck, float *kcrs, int R, int S, int C, int K,int blocksifm, int blocksofm, int ifmblock,int ofmblock, int start, int end)
-{
- LIBXSMM_VLA_DECL(4, const float, input, rsck, S, C,K);
- LIBXSMM_VLA_DECL(6, float, output, kcrs, blocksifm,R,S,ifmblock, ofmblock);
- int r, s, k,c, v1,v2;
-
- for (k = start; k < end ; k++ ) {
- for(c = 0; c < blocksifm;c++){
- for ( r = 0; r < R; r++ ) {
- for ( s = 0; s < S; s++ ){
- for ( v1 = c*ifmblock; v1 < std::min(C,(c+1)*ifmblock) ; v1++ ) {
- for ( v2 = k*ofmblock; v2 < std::min(K, (k+1)*ofmblock); v2++ )
- LIBXSMM_VLA_ACCESS(6, output, k,c, r, s,v1- c*ifmblock,v2-k*ofmblock, blocksifm, R, S,ifmblock,ofmblock) = LIBXSMM_VLA_ACCESS(4, input, r, s, v1, v2, S, C, K);
- for ( v2 = K; v2 < (k+1)*ofmblock ; v2++ )
- LIBXSMM_VLA_ACCESS(6, output, k,c, r, s,v1- c*ifmblock,v2-k*ofmblock, blocksifm, R, S,ifmblock,ofmblock) = 0.0f;
- }
- for ( v1 = C; v1 < (c+1)*ifmblock ; v1++ ) {
- for ( v2 = k*ofmblock; v2 < (k+1)*ofmblock; v2++ )
- LIBXSMM_VLA_ACCESS(6, output, k,c, r, s,v1- c*ifmblock,v2-k*ofmblock, blocksifm, R, S,ifmblock,ofmblock) = 0.0f;
+LIBXSMM_INLINE void copy_RSCK_to_custom(const float* rsck, float* kcrs, int R,
+ int S, int C, int K, int blocksifm,
+ int blocksofm, int ifmblock,
+ int ofmblock, int start, int end) {
+ LIBXSMM_VLA_DECL(4, const float, input, rsck, S, C, K);
+ LIBXSMM_VLA_DECL(6, float, output, kcrs, blocksifm, R, S, ifmblock, ofmblock);
+ int r, s, k, c, v1, v2;
+
+ for (k = start; k < end; k++) {
+ for (c = 0; c < blocksifm; c++) {
+ for (r = 0; r < R; r++) {
+ for (s = 0; s < S; s++) {
+ for (v1 = c * ifmblock; v1 < std::min(C, (c + 1) * ifmblock); v1++) {
+ for (v2 = k * ofmblock; v2 < std::min(K, (k + 1) * ofmblock); v2++)
+ LIBXSMM_VLA_ACCESS(6, output, k, c, r, s, v1 - c * ifmblock,
+ v2 - k * ofmblock, blocksifm, R, S, ifmblock,
+ ofmblock) =
+ LIBXSMM_VLA_ACCESS(4, input, r, s, v1, v2, S, C, K);
+ for (v2 = K; v2 < (k + 1) * ofmblock; v2++)
+ LIBXSMM_VLA_ACCESS(6, output, k, c, r, s, v1 - c * ifmblock,
+ v2 - k * ofmblock, blocksifm, R, S, ifmblock,
+ ofmblock) = 0.0f;
+ }
+ for (v1 = C; v1 < (c + 1) * ifmblock; v1++) {
+ for (v2 = k * ofmblock; v2 < (k + 1) * ofmblock; v2++)
+ LIBXSMM_VLA_ACCESS(6, output, k, c, r, s, v1 - c * ifmblock,
+ v2 - k * ofmblock, blocksifm, R, S, ifmblock,
+ ofmblock) = 0.0f;
}
}
}
@@ -109,35 +117,28 @@ LIBXSMM_INLINE void copy_RSCK_to_custom(const float* rsck, float *kcrs, int R, i
}
}
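
[Editor's note] The index arithmetic in the copy above is easier to see in scalar form: each channel or filter index splits into a block id and an offset within that block, and tail slots of a partial block are zero-filled. The helpers below are a minimal illustration and are not part of the file.

// Channel v1 lives in block c = v1 / ifmblock at offset v1 - c * ifmblock,
// which equals v1 % ifmblock; filter v2 splits the same way with ofmblock.
inline int block_index(int v, int block_size) { return v / block_size; }
inline int block_offset(int v, int block_size) { return v % block_size; }
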
-
+class libxsmm_dnn_conv_desc_wrap {
+ public:
+ const libxsmm_dnn_conv_desc d;
-class libxsmm_dnn_conv_desc_wrap{
- public:
- const libxsmm_dnn_conv_desc d;
-
- libxsmm_dnn_conv_desc_wrap(const libxsmm_dnn_conv_desc &d_) : d(d_){
- }
- bool operator==(const libxsmm_dnn_conv_desc_wrap &w) const{
- return( d.N == w.d.N &&
- d.C == w.d.C &&
- d.H == w.d.H &&
- d.W == w.d.W &&
- d.K == w.d.K &&
- d.R == w.d.R &&
- d.S == w.d.S &&
- d.u == w.d.u &&
- d.v == w.d.v &&
- d.pad_h_in == w.d.pad_h_in &&
- d.pad_w_in == w.d.pad_w_in
- );
- }
+ libxsmm_dnn_conv_desc_wrap(const libxsmm_dnn_conv_desc& d_) : d(d_) {}
+ bool operator==(const libxsmm_dnn_conv_desc_wrap& w) const {
+ return (d.N == w.d.N && d.C == w.d.C && d.H == w.d.H && d.W == w.d.W &&
+ d.K == w.d.K && d.R == w.d.R && d.S == w.d.S && d.u == w.d.u &&
+ d.v == w.d.v && d.pad_h == w.d.pad_h && d.pad_w == w.d.pad_w);
+ }
};
-
-
-struct HashFunction{
- std::size_t operator()(const libxsmm_dnn_conv_desc_wrap & w) const{
+
+struct HashFunction {
+ std::size_t operator()(const libxsmm_dnn_conv_desc_wrap& w) const {
+ // unsigned char ptr[sizeof(&w.d)];
+
+ // memcpy(ptr, (unsigned char *)&w.d, sizeof(&w.d))
+
+ //
+ /*
std::ostringstream N,C,H,W,K,R,S,u,v,padh,padw;
-
+
N << w.d.N; C << w.d.C;
H << w.d.H; W << w.d.W;
K << w.d.K; R << w.d.R;
@@ -152,59 +153,71 @@ struct HashFunction{
+ S.str() + u.str()\
+ v.str() + padh.str()\
+ padw.str();
-
- return ( std::hash<std::string>()(out_));
+ //
+ //
+ */
+ return (std::hash<unsigned long long>()((unsigned long long)&(w.d)));
}
};
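
[Editor's note] The hash actually returned above is computed from the wrapper's address, so two logically equal descriptors stored at different addresses hash to different buckets even though operator== compares fields. A field-based combiner along the lines of the sketch below would keep equal keys together; this is an illustrative alternative, not the code in this commit, and it assumes the descriptor fields are plain ints as the comparison above suggests.

#include <cstddef>
#include <functional>

struct FieldHash {
  // Standard hash-combine step (Boost-style golden-ratio constant).
  static std::size_t mix(std::size_t seed, std::size_t v) {
    return seed ^ (v + 0x9e3779b9 + (seed << 6) + (seed >> 2));
  }
  std::size_t operator()(const libxsmm_dnn_conv_desc_wrap& w) const {
    std::size_t h = 0;
    // Combine exactly the fields that operator== compares.
    for (int v : {w.d.N, w.d.C, w.d.H, w.d.W, w.d.K, w.d.R, w.d.S,
                  w.d.u, w.d.v, w.d.pad_h, w.d.pad_w}) {
      h = mix(h, std::hash<int>()(v));
    }
    return h;
  }
};
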
-class handles{
- public:
- libxsmm_dnn_layer* find( const libxsmm_dnn_conv_desc_wrap &w) {
- std::unordered_map<libxsmm_dnn_conv_desc_wrap , libxsmm_dnn_layer*,
- HashFunction>::iterator i = libxsmm_handles.find(w);
- if (i == libxsmm_handles.end()){
- libxsmm_dnn_err_t status;
- libxsmm_dnn_layer* libxsmm_handle =
- libxsmm_dnn_create_conv_layer(w.d, &status);
- chk_libxsmm_err(status, "Create handle");
- libxsmm_handles.insert(std::make_pair(w, libxsmm_handle));
- return libxsmm_handle;
- }
- else
- return i->second;
- }
- ~handles(){
- std::unordered_map<libxsmm_dnn_conv_desc_wrap , libxsmm_dnn_layer*,
- HashFunction>::iterator i;
- for (i= libxsmm_handles.begin(); i != libxsmm_handles.end(); i++)
+class handles {
+ public:
+ libxsmm_dnn_layer* find(const libxsmm_dnn_conv_desc_wrap& w) {
+ std::unordered_map<libxsmm_dnn_conv_desc_wrap, libxsmm_dnn_layer*,
+ HashFunction>::iterator i = libxsmm_handles.find(w);
+ if (i == libxsmm_handles.end()) {
+ libxsmm_dnn_err_t status;
+ libxsmm_dnn_layer* libxsmm_handle =
+ libxsmm_dnn_create_conv_layer(w.d, &status);
+ chk_libxsmm_err(status, "Create handle");
+ libxsmm_handles.insert(std::make_pair(w, libxsmm_handle));
+ return libxsmm_handle;
+ } else
+ return i->second;
+ }
+ ~handles() {
+ std::unordered_map<libxsmm_dnn_conv_desc_wrap, libxsmm_dnn_layer*,
+ HashFunction>::iterator i;
+ for (i = libxsmm_handles.begin(); i != libxsmm_handles.end(); i++)
chk_libxsmm_err(libxsmm_dnn_destroy_conv_layer(i->second),
- "Destroy handle");
- }
- private:
- std::unordered_map<libxsmm_dnn_conv_desc_wrap , libxsmm_dnn_layer*, HashFunction> libxsmm_handles;
+ "Destroy handle");
+ }
+
+ private:
+ std::unordered_map<libxsmm_dnn_conv_desc_wrap, libxsmm_dnn_layer*,
+ HashFunction>
+ libxsmm_handles;
};
static handles libxsmm_handles;
+//#define LIBXSMM_DETAILED_TIMING
+
template <typename InputPtr, typename FilterPtr, typename OutputPtr>
static bool CallLibxsmmConvGeneric(OpKernelContext* ctx,
const libxsmm_dnn_conv_desc& desc,
- libxsmm_dnn_compute_kind kind, InputPtr input,
- FilterPtr filter, OutputPtr output) {
+ libxsmm_dnn_compute_kind kind,
+ InputPtr input, FilterPtr filter,
+ OutputPtr output) {
+#if defined(LIBXSMM_DETAILED_TIMING)
+ unsigned long long l_tick1, l_tick2, l_tick3, l_tick4, l_tick5, l_tick6,
+ l_tick7, l_tick8, l_tick9, l_tick10;
+ l_tick1 = libxsmm_timer_tick();
+#endif
// setup scoped allocator, which adopts the allocator from the context
const libxsmm_tf_allocator<libxsmm_scratch_allocator> tf_allocator(*ctx);
libxsmm_dnn_err_t status;
libxsmm_dnn_layer* libxsmm_handle;
libxsmm_dnn_conv_desc_wrap w(desc);
void* scratch;
-
- if (kind == LIBXSMM_DNN_COMPUTE_KIND_FWD)
- libxsmm_handle = libxsmm_handles.find(w);
- else {
- libxsmm_handle = libxsmm_dnn_create_conv_layer(desc, &status);
- chk_libxsmm_err(status, "Create handle");
- }
-
+
+ // if(kind == LIBXSMM_DNN_COMPUTE_KIND_FWD)
+ libxsmm_handle = libxsmm_handles.find(w);
+ // else{
+ // libxsmm_handle = libxsmm_dnn_create_conv_layer(desc, &status);
+ // chk_libxsmm_err(status, "Create handle");
+ //}
+
status = libxsmm_dnn_get_codegen_success(libxsmm_handle, kind);
if (status == LIBXSMM_DNN_WARN_FALLBACK) {
chk_libxsmm_err(libxsmm_dnn_destroy_conv_layer(libxsmm_handle),
@@ -217,100 +230,168 @@ static bool CallLibxsmmConvGeneric(OpKernelContext* ctx,
libxsmm_dnn_buffer* libxsmm_output;
libxsmm_dnn_filter* libxsmm_filter;
- /*
- const DeviceBase::CpuWorkerThreads* worker_threads =
- ctx->device()->tensorflow_cpu_worker_threads();
-
- int num_threads = worker_threads->num_threads;
-*/
+#if defined(LIBXSMM_DETAILED_TIMING)
+ l_tick2 = libxsmm_timer_tick();
+#endif
int ifmblock = (libxsmm_handle->ifmblock);
int ofmblock = (libxsmm_handle->ofmblock);
- int blocksifm = desc.C%ifmblock ==0 ? desc.C/ifmblock :desc.C/ifmblock + 1;
- int blocksofm = desc.K%ofmblock ==0 ? desc.K/ofmblock :desc.K/ofmblock + 1;
- float *native_filter = (float*)libxsmm_aligned_scratch(
- blocksofm*blocksifm*desc.R*desc.S*ifmblock*ofmblock*sizeof(float),
- 2097152);
+ int blocksifm =
+ desc.C % ifmblock == 0 ? desc.C / ifmblock : desc.C / ifmblock + 1;
+ int blocksofm =
+ desc.K % ofmblock == 0 ? desc.K / ofmblock : desc.K / ofmblock + 1;
+ float* native_filter =
+ (float*)libxsmm_aligned_scratch(blocksofm * blocksifm * desc.R * desc.S *
+ ifmblock * ofmblock * sizeof(float),
+ 2097152);
const DeviceBase::CpuWorkerThreads* worker_threads =
ctx->device()->tensorflow_cpu_worker_threads();
int num_threads = worker_threads->num_threads;
-
- if(blocksofm > num_threads){
- int work = blocksofm;
- BlockingCounter count(num_threads);
- for (int i = 0; i < num_threads; ++i) {
+#if 1
+ if (kind == LIBXSMM_DNN_COMPUTE_KIND_FWD ||
+ kind == LIBXSMM_DNN_COMPUTE_KIND_BWD) {
+ if (blocksofm > num_threads) {
+ int work = blocksofm;
+ BlockingCounter count(num_threads);
+ for (int i = 0; i < num_threads; ++i) {
worker_threads->workers->Schedule([=, &count]() {
- int start = work/num_threads*i;
- int end = (start + work/num_threads) > work ? work: start + work/num_threads;
- copy_RSCK_to_custom(filter, native_filter, desc.R, desc.S,desc.C, desc.K,blocksifm,blocksofm,ifmblock,ofmblock,start, end);
- count.DecrementCount();
+ int start = work / num_threads * i;
+ int end = (start + work / num_threads) > work
+ ? work
+ : start + work / num_threads;
+ copy_RSCK_to_custom(filter, native_filter, desc.R, desc.S, desc.C,
+ desc.K, blocksifm, blocksofm, ifmblock, ofmblock,
+ start, end);
+ count.DecrementCount();
});
- }
- count.Wait();
- }
- else{
+ }
+ count.Wait();
+ } else {
+ int work = blocksofm;
+ int num_threads = work;
- int work = blocksofm;
- int num_threads = work;
-
- BlockingCounter count(num_threads);
- for (int i = 0; i < num_threads; ++i) {
+ BlockingCounter count(num_threads);
+ for (int i = 0; i < num_threads; ++i) {
worker_threads->workers->Schedule([=, &count]() {
- int start = i;
- int end = i+1;
- copy_RSCK_to_custom(filter, native_filter, desc.R, desc.S,desc.C, desc.K,blocksifm,blocksofm,ifmblock,ofmblock, start, end);
- count.DecrementCount();
+ int start = i;
+ int end = i + 1;
+ copy_RSCK_to_custom(filter, native_filter, desc.R, desc.S, desc.C,
+ desc.K, blocksifm, blocksofm, ifmblock, ofmblock,
+ start, end);
+ count.DecrementCount();
});
+ }
+ count.Wait();
}
- count.Wait();
}
-
- libxsmm_input = libxsmm_dnn_link_buffer(
- libxsmm_handle, LIBXSMM_DNN_INPUT, input, LIBXSMM_DNN_TENSOR_FORMAT_NHWC_PTR, &status);
+ // Added: for weight update
+ else if (kind == LIBXSMM_DNN_COMPUTE_KIND_UPD) {
+ libxsmm_filter =
+ libxsmm_dnn_link_filter(libxsmm_handle, LIBXSMM_DNN_FILTER, filter,
+ LIBXSMM_DNN_TENSOR_FORMAT_RSCK_PTR, &status);
+    chk_libxsmm_err(status,
+                    "Link filter");  // weight update uses RSCK since the
+                                     // filter must be returned in RSCK
+                                     // format
+ }
+#else
+ memset(native_filter, 0,
+ blocksofm * blocksifm * desc.R * desc.S * ifmblock * ofmblock *
+ sizeof(float));
+#endif
+
+#if defined(LIBXSMM_DETAILED_TIMING)
+ l_tick3 = libxsmm_timer_tick();
+#endif
+
+ libxsmm_input =
+ libxsmm_dnn_link_buffer(libxsmm_handle, LIBXSMM_DNN_INPUT, input,
+ LIBXSMM_DNN_TENSOR_FORMAT_NHWC_PTR, &status);
chk_libxsmm_err(status, "Link input buffer");
- libxsmm_output = libxsmm_dnn_link_buffer(
- libxsmm_handle, LIBXSMM_DNN_OUTPUT, output, LIBXSMM_DNN_TENSOR_FORMAT_NHWC_PTR, &status);
+ libxsmm_output =
+ libxsmm_dnn_link_buffer(libxsmm_handle, LIBXSMM_DNN_OUTPUT, output,
+ LIBXSMM_DNN_TENSOR_FORMAT_NHWC_PTR, &status);
chk_libxsmm_err(status, "Link output buffer");
- libxsmm_filter = libxsmm_dnn_link_filter(
- libxsmm_handle, LIBXSMM_DNN_FILTER, native_filter, LIBXSMM_DNN_TENSOR_FORMAT_LIBXSMM_PTR, &status);
- chk_libxsmm_err(status, "Link filter");
-
- chk_libxsmm_err(libxsmm_dnn_zero_buffer(libxsmm_output), "Zero output");
-
-
+ if (kind == LIBXSMM_DNN_COMPUTE_KIND_FWD ||
+ kind == LIBXSMM_DNN_COMPUTE_KIND_BWD) {
+ libxsmm_filter = libxsmm_dnn_link_filter(
+ libxsmm_handle, LIBXSMM_DNN_FILTER, native_filter,
+ LIBXSMM_DNN_TENSOR_FORMAT_LIBXSMM_PTR, &status);
+ chk_libxsmm_err(status, "Link filter");
+ }
if (kind == LIBXSMM_DNN_COMPUTE_KIND_FWD) {
- chk_libxsmm_err(libxsmm_dnn_bind_buffer(libxsmm_handle, libxsmm_input, LIBXSMM_DNN_REGULAR_INPUT),
+ chk_libxsmm_err(libxsmm_dnn_zero_buffer(libxsmm_output), "Zero output");
+
+ chk_libxsmm_err(libxsmm_dnn_bind_buffer(libxsmm_handle, libxsmm_input,
+ LIBXSMM_DNN_REGULAR_INPUT),
"Bind input forward");
- chk_libxsmm_err(
- libxsmm_dnn_bind_buffer(libxsmm_handle, libxsmm_output, LIBXSMM_DNN_REGULAR_OUTPUT),
- "Bind output forward");
- chk_libxsmm_err(libxsmm_dnn_bind_filter(libxsmm_handle, libxsmm_filter, LIBXSMM_DNN_REGULAR_FILTER),
+ chk_libxsmm_err(libxsmm_dnn_bind_buffer(libxsmm_handle, libxsmm_output,
+ LIBXSMM_DNN_REGULAR_OUTPUT),
+ "Bind output forward");
+ chk_libxsmm_err(libxsmm_dnn_bind_filter(libxsmm_handle, libxsmm_filter,
+ LIBXSMM_DNN_REGULAR_FILTER),
"Bind filter forward");
- } else {
- chk_libxsmm_err(libxsmm_dnn_bind_buffer(libxsmm_handle, libxsmm_input, LIBXSMM_DNN_GRADIENT_INPUT),
+ } else if (kind == LIBXSMM_DNN_COMPUTE_KIND_BWD) {
+ chk_libxsmm_err(libxsmm_dnn_zero_buffer(libxsmm_input), "Zero input");
+
+ chk_libxsmm_err(libxsmm_dnn_bind_buffer(libxsmm_handle, libxsmm_input,
+ LIBXSMM_DNN_GRADIENT_INPUT),
"Bind input backward");
- chk_libxsmm_err(
- libxsmm_dnn_bind_buffer(libxsmm_handle, libxsmm_output, LIBXSMM_DNN_GRADIENT_OUTPUT),
- "Bind output backward");
- chk_libxsmm_err(libxsmm_dnn_bind_filter(libxsmm_handle, libxsmm_filter, LIBXSMM_DNN_REGULAR_FILTER),
+ chk_libxsmm_err(libxsmm_dnn_bind_buffer(libxsmm_handle, libxsmm_output,
+ LIBXSMM_DNN_GRADIENT_OUTPUT),
+ "Bind output backward");
+ chk_libxsmm_err(libxsmm_dnn_bind_filter(libxsmm_handle, libxsmm_filter,
+ LIBXSMM_DNN_REGULAR_FILTER),
"Bind filter backward");
+ } else if (kind == LIBXSMM_DNN_COMPUTE_KIND_UPD) {
+ chk_libxsmm_err(libxsmm_dnn_zero_filter(libxsmm_filter), "Zero filter");
+
+    chk_libxsmm_err(libxsmm_dnn_bind_buffer(libxsmm_handle, libxsmm_input,
+                                            LIBXSMM_DNN_REGULAR_INPUT),
+                    "Bind input weight update");
+ chk_libxsmm_err(libxsmm_dnn_bind_buffer(libxsmm_handle, libxsmm_output,
+ LIBXSMM_DNN_GRADIENT_OUTPUT),
+ "Bind output weight update");
+ chk_libxsmm_err(libxsmm_dnn_bind_filter(libxsmm_handle, libxsmm_filter,
+ LIBXSMM_DNN_GRADIENT_FILTER),
+ "Bind filter weight update");
+ } else {
+ /* shouldn't happen */
}
+#if defined(LIBXSMM_DETAILED_TIMING)
+ l_tick4 = libxsmm_timer_tick();
+#endif
+
/* bind scratch */
- scratch = (void*)libxsmm_aligned_scratch( libxsmm_dnn_get_scratch_size( libxsmm_handle, kind, &status ), 2097152);
- chk_libxsmm_err( status, "scratch allocation" );
- chk_libxsmm_err( libxsmm_dnn_bind_scratch( libxsmm_handle, kind, scratch ), "binding scratch" );
+ scratch = (void*)libxsmm_aligned_scratch(
+ libxsmm_dnn_get_scratch_size(libxsmm_handle, LIBXSMM_DNN_COMPUTE_KIND_ALL,
+ &status),
+ 2097152);
+ chk_libxsmm_err(status, "scratch allocation");
+ chk_libxsmm_err(libxsmm_dnn_bind_scratch(
+ libxsmm_handle, LIBXSMM_DNN_COMPUTE_KIND_ALL, scratch),
+ "binding scratch");
+
+#if defined(LIBXSMM_DETAILED_TIMING)
+ l_tick5 = libxsmm_timer_tick();
+#endif
if (kind == LIBXSMM_DNN_COMPUTE_KIND_BWD) {
libxsmm_dnn_transpose_filter(libxsmm_handle, LIBXSMM_DNN_FILTER);
}
+#if defined(LIBXSMM_DETAILED_TIMING)
+ l_tick6 = libxsmm_timer_tick();
+#endif
+
+#if 1
BlockingCounter counter(num_threads);
-
+
for (int i = 0; i < num_threads; ++i) {
worker_threads->workers->Schedule([=, &counter]() {
chk_libxsmm_err(libxsmm_dnn_execute_st(libxsmm_handle, kind, 0, i),
@@ -319,28 +400,97 @@ static bool CallLibxsmmConvGeneric(OpKernelContext* ctx,
});
}
counter.Wait();
+#else
+#pragma omp parallel
+ {
+ chk_libxsmm_err(
+ libxsmm_dnn_execute_st(libxsmm_handle, kind, 0, omp_get_thread_num()),
+ "Worker");
+ }
+#endif
+
+#if defined(LIBXSMM_DETAILED_TIMING)
+ l_tick7 = libxsmm_timer_tick();
+#endif
+
+ if (kind == LIBXSMM_DNN_COMPUTE_KIND_UPD) {
+ libxsmm_dnn_reduce_wu_filters(libxsmm_handle, LIBXSMM_DNN_GRADIENT_FILTER);
+ }
+
+#if defined(LIBXSMM_DETAILED_TIMING)
+ l_tick8 = libxsmm_timer_tick();
+#endif
/* clean up */
- chk_libxsmm_err( libxsmm_dnn_release_scratch( libxsmm_handle, LIBXSMM_DNN_COMPUTE_KIND_ALL ), "release scratch" );
+ chk_libxsmm_err(
+ libxsmm_dnn_release_scratch(libxsmm_handle, LIBXSMM_DNN_COMPUTE_KIND_ALL),
+ "release scratch");
if (kind == LIBXSMM_DNN_COMPUTE_KIND_FWD) {
- chk_libxsmm_err( libxsmm_dnn_release_buffer( libxsmm_handle, LIBXSMM_DNN_REGULAR_INPUT ), "release input" );
- chk_libxsmm_err( libxsmm_dnn_release_buffer( libxsmm_handle, LIBXSMM_DNN_REGULAR_OUTPUT ), "release output" );
- chk_libxsmm_err( libxsmm_dnn_release_filter( libxsmm_handle, LIBXSMM_DNN_REGULAR_FILTER ), "release filter" );
+ chk_libxsmm_err(
+ libxsmm_dnn_release_buffer(libxsmm_handle, LIBXSMM_DNN_REGULAR_INPUT),
+ "release input");
+ chk_libxsmm_err(
+ libxsmm_dnn_release_buffer(libxsmm_handle, LIBXSMM_DNN_REGULAR_OUTPUT),
+ "release output");
+ chk_libxsmm_err(
+ libxsmm_dnn_release_filter(libxsmm_handle, LIBXSMM_DNN_REGULAR_FILTER),
+ "release filter");
+ } else if (kind == LIBXSMM_DNN_COMPUTE_KIND_BWD) {
+ chk_libxsmm_err(
+ libxsmm_dnn_release_buffer(libxsmm_handle, LIBXSMM_DNN_GRADIENT_INPUT),
+ "release input");
+ chk_libxsmm_err(
+ libxsmm_dnn_release_buffer(libxsmm_handle, LIBXSMM_DNN_GRADIENT_OUTPUT),
+ "release output");
+ chk_libxsmm_err(
+ libxsmm_dnn_release_filter(libxsmm_handle, LIBXSMM_DNN_REGULAR_FILTER),
+ "release filter");
+ } else if (kind == LIBXSMM_DNN_COMPUTE_KIND_UPD) {
+ chk_libxsmm_err(
+ libxsmm_dnn_release_buffer(libxsmm_handle, LIBXSMM_DNN_REGULAR_INPUT),
+ "release input");
+ chk_libxsmm_err(
+ libxsmm_dnn_release_buffer(libxsmm_handle, LIBXSMM_DNN_GRADIENT_OUTPUT),
+ "release output");
+ chk_libxsmm_err(
+ libxsmm_dnn_release_filter(libxsmm_handle, LIBXSMM_DNN_GRADIENT_FILTER),
+ "release filter");
} else {
- chk_libxsmm_err( libxsmm_dnn_release_buffer( libxsmm_handle, LIBXSMM_DNN_GRADIENT_INPUT ), "release input" );
- chk_libxsmm_err( libxsmm_dnn_release_buffer( libxsmm_handle, LIBXSMM_DNN_GRADIENT_OUTPUT ), "release output" );
- chk_libxsmm_err( libxsmm_dnn_release_filter( libxsmm_handle, LIBXSMM_DNN_REGULAR_FILTER ), "release filter" );
+ /* shouldn't happen */
}
chk_libxsmm_err(libxsmm_dnn_destroy_buffer(libxsmm_input), "Destroy input");
chk_libxsmm_err(libxsmm_dnn_destroy_buffer(libxsmm_output), "Destroy output");
chk_libxsmm_err(libxsmm_dnn_destroy_filter(libxsmm_filter), "Destroy filter");
-
- if(kind != LIBXSMM_DNN_COMPUTE_KIND_FWD)
- chk_libxsmm_err(libxsmm_dnn_destroy_conv_layer(libxsmm_handle),
- "Destroy handle");
+
+#if defined(LIBXSMM_DETAILED_TIMING)
+ l_tick9 = libxsmm_timer_tick();
+#endif
+
+ // if(kind != LIBXSMM_DNN_COMPUTE_KIND_FWD)
+ // chk_libxsmm_err(libxsmm_dnn_destroy_conv_layer(libxsmm_handle),
+ // "Destroy handle");
libxsmm_free(native_filter);
libxsmm_free(scratch);
+
+#if defined(LIBXSMM_DETAILED_TIMING)
+ l_tick10 = libxsmm_timer_tick();
+ printf(
+ "time for convolution (%i, %i, %i, %i, %i): %f, %f, %f, %f, %f, %f, %f, "
+ "%f, %f, %f\n",
+ desc.N, desc.C, desc.K, desc.R, desc.S,
+ libxsmm_timer_duration(l_tick1, l_tick2),
+ libxsmm_timer_duration(l_tick2, l_tick3),
+ libxsmm_timer_duration(l_tick3, l_tick4),
+ libxsmm_timer_duration(l_tick4, l_tick5),
+ libxsmm_timer_duration(l_tick5, l_tick6),
+ libxsmm_timer_duration(l_tick6, l_tick7),
+ libxsmm_timer_duration(l_tick7, l_tick8),
+ libxsmm_timer_duration(l_tick8, l_tick9),
+ libxsmm_timer_duration(l_tick9, l_tick10),
+ libxsmm_timer_duration(l_tick1, l_tick10));
+#endif
+
return true; // Succeeded
}
@@ -348,8 +498,8 @@ template <typename T>
struct XsmmFwdConv2D<CPUDevice, T> {
bool operator()(OpKernelContext* ctx, const libxsmm_dnn_conv_desc& desc,
const T* input, const T* filter, T* output) {
- return CallLibxsmmConvGeneric(ctx, desc, LIBXSMM_DNN_COMPUTE_KIND_FWD, input,
- filter, output);
+ return CallLibxsmmConvGeneric(ctx, desc, LIBXSMM_DNN_COMPUTE_KIND_FWD,
+ input, filter, output);
}
};
@@ -357,8 +507,8 @@ template <typename T>
struct XsmmBkwInputConv2D<CPUDevice, T> {
bool operator()(OpKernelContext* ctx, const libxsmm_dnn_conv_desc& desc,
T* input, const T* filter, const T* output) {
- return CallLibxsmmConvGeneric(ctx, desc, LIBXSMM_DNN_COMPUTE_KIND_BWD, input,
- filter, output);
+ return CallLibxsmmConvGeneric(ctx, desc, LIBXSMM_DNN_COMPUTE_KIND_BWD,
+ input, filter, output);
}
};
@@ -366,8 +516,8 @@ template <typename T>
struct XsmmBkwFilterConv2D<CPUDevice, T> {
bool operator()(OpKernelContext* ctx, const libxsmm_dnn_conv_desc& desc,
const T* input, T* filter, const T* output) {
- return CallLibxsmmConvGeneric(ctx, desc, LIBXSMM_DNN_COMPUTE_KIND_UPD, input,
- filter, output);
+ return CallLibxsmmConvGeneric(ctx, desc, LIBXSMM_DNN_COMPUTE_KIND_UPD,
+ input, filter, output);
}
};
diff --git a/tensorflow/core/lib/io/inputbuffer.cc b/tensorflow/core/lib/io/inputbuffer.cc
index 9cff1d349e..7efe2dc543 100644
--- a/tensorflow/core/lib/io/inputbuffer.cc
+++ b/tensorflow/core/lib/io/inputbuffer.cc
@@ -15,6 +15,7 @@ limitations under the License.
#include "tensorflow/core/lib/io/inputbuffer.h"
#include "tensorflow/core/lib/core/errors.h"
+#include "tensorflow/core/platform/logging.h"
namespace tensorflow {
namespace io {
@@ -43,25 +44,26 @@ Status InputBuffer::FillBuffer() {
Status InputBuffer::ReadLine(string* result) {
result->clear();
- int i;
Status s;
- for (i = 0;; i++) {
- if (pos_ == limit_) {
- // Get more data into buffer
- s = FillBuffer();
- if (limit_ == buf_) {
- break;
+ do {
+ size_t buf_remain = limit_ - pos_;
+ char* newline = static_cast<char*>(memchr(pos_, '\n', buf_remain));
+ if (newline != nullptr) {
+ size_t result_len = newline - pos_;
+ result->append(pos_, result_len);
+ pos_ = newline + 1;
+ if (!result->empty() && result->back() == '\r') {
+ result->resize(result->size() - 1);
}
- }
- char c = *pos_++;
- if (c == '\n') {
- // We don't append the '\n' to *result
return Status::OK();
}
- // We don't append '\r' to *result
- if (c != '\r') {
- *result += c;
- }
+ if (buf_remain > 0) result->append(pos_, buf_remain);
+ // Get more data into buffer
+ s = FillBuffer();
+ DCHECK_EQ(pos_, buf_);
+ } while (limit_ != buf_);
+ if (!result->empty() && result->back() == '\r') {
+ result->resize(result->size() - 1);
}
if (errors::IsOutOfRange(s) && !result->empty()) {
return Status::OK();
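
[Editor's note] The rewritten loop above replaces per-character copying with a memchr scan over the buffered bytes, appending whole chunks at once and stripping a trailing '\r' so CRLF files behave like LF files. Below is a standalone sketch of the same scan over an in-memory range; read_line is an illustrative name, not this class's method.

#include <cstring>
#include <string>

// Scan [pos, limit) for '\n'; append whole chunks to *result instead of
// copying byte-by-byte, and strip a trailing '\r' for CRLF input.
bool read_line(const char*& pos, const char* limit, std::string* result) {
  result->clear();
  const char* newline =
      static_cast<const char*>(memchr(pos, '\n', limit - pos));
  if (newline != nullptr) {
    result->append(pos, newline - pos);
    pos = newline + 1;  // consume the '\n'; it is not part of the line
    if (!result->empty() && result->back() == '\r') result->pop_back();
    return true;
  }
  result->append(pos, limit - pos);  // no newline yet: take what remains
  pos = limit;
  return false;  // caller refills the buffer and tries again
}
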
diff --git a/tensorflow/core/ops/array_ops.cc b/tensorflow/core/ops/array_ops.cc
index 6930af48a7..e81490c498 100644
--- a/tensorflow/core/ops/array_ops.cc
+++ b/tensorflow/core/ops/array_ops.cc
@@ -1323,6 +1323,11 @@ Produces an output tensor with shape `indices.shape + params.shape[1:]` where:
If `indices` is a permutation and `len(indices) == params.shape[0]` then
this operation will permute `params` accordingly.
+`validate_indices`: DEPRECATED. If this operation is assigned to CPU, values in
+`indices` are always validated to be within range. If assigned to GPU,
+out-of-bound indices result in unspecified behavior (currently the result is
+`0`, but this may become an error in the future).
+
<div style="width:70%; margin:auto; margin-bottom:10px; margin-top:20px;">
<img style="width:100%" src="../../images/Gather.png" alt>
</div>
diff --git a/tensorflow/core/ops/nn_grad.cc b/tensorflow/core/ops/nn_grad.cc
index e3b876b240..05ad635f58 100644
--- a/tensorflow/core/ops/nn_grad.cc
+++ b/tensorflow/core/ops/nn_grad.cc
@@ -181,4 +181,35 @@ Status MaxPoolGrad(const AttrSlice& attrs, FunctionDef* g) {
}
REGISTER_OP_GRADIENT("MaxPool", MaxPoolGrad);
+Status MaxPoolGradGrad(const AttrSlice& attrs, FunctionDef* g) {
+ // clang-format off
+ *g = FDH::Define(
+ // Arg defs
+ {"input: T", "grad: T"},
+ // Ret val defs
+ {"output: T"},
+ // Attr defs
+ {"T: {float, half} = DT_FLOAT",
+ "ksize: list(int) >= 4",
+ "strides: list(int) >= 4",
+ GetPaddingAttrString()},
+ // Nodes
+ {
+ // Invoke MaxPool again to recompute the outputs (removed by CSE?).
+ {{"maxpool"}, "MaxPool", {"input"},
+ /*Attrs=*/{{"T", "$T"},
+ {"ksize", "$ksize"},
+ {"strides", "$strides"},
+ {"padding", "$padding"}}},
+ {{"output"}, "MaxPoolGradGrad", {"input", "maxpool", "grad"},
+ /*Attrs=*/{{"T", "$T"},
+ {"ksize", "$ksize"},
+ {"strides", "$strides"},
+ {"padding", "$padding"}}}
+ });
+ // clang-format on
+ return Status::OK();
+}
+REGISTER_OP_GRADIENT("MaxPoolGrad", MaxPoolGradGrad);
+
} // end namespace tensorflow
diff --git a/tensorflow/core/ops/nn_ops.cc b/tensorflow/core/ops/nn_ops.cc
index e56b27b0c0..e9d5897af0 100644
--- a/tensorflow/core/ops/nn_ops.cc
+++ b/tensorflow/core/ops/nn_ops.cc
@@ -89,7 +89,7 @@ REGISTER_OP("AvgPool")
.Attr("strides: list(int) >= 4")
.Attr(GetPaddingAttrString())
.Attr(GetConvnetDataFormatAttrString())
- .Attr("T: {float, half, double}")
+ .Attr("T: realnumbertype")
.SetShapeFn(shape_inference::AvgPoolShape)
.Doc(R"doc(
Performs average pooling on the input.
@@ -117,7 +117,7 @@ REGISTER_OP("AvgPoolGrad")
.Attr("strides: list(int) >= 4")
.Attr(GetPaddingAttrString())
.Attr(GetConvnetDataFormatAttrString())
- .Attr("T: {float, half, double}")
+ .Attr("T: realnumbertype")
.SetShapeFn([](InferenceContext* c) {
// NOTE(mrry): We could in principle work out the shape from the
// gradients and the attrs, but if we do not know orig_input_shape
@@ -1186,15 +1186,16 @@ data_format: The data format of the input and output data. With the
)doc");
REGISTER_OP("MaxPool3DGrad")
- .Input("orig_input: float")
- .Input("orig_output: float")
+ .Input("orig_input: TInput")
+ .Input("orig_output: TInput")
.Input("grad: T")
.Output("output: T")
.Attr("ksize: list(int) >= 5 ")
.Attr("strides: list(int) >= 5")
.Attr(GetPaddingAttrString())
.Attr(GetConvnet3dDataFormatAttrString())
- .Attr("T: numbertype")
+ .Attr("T: numbertype = DT_FLOAT")
+ .Attr("TInput: numbertype = DT_FLOAT")
.SetShapeFn([](InferenceContext* c) {
return UnchangedShapeWithRank(c, 5);
})
@@ -1216,6 +1217,44 @@ data_format: The data format of the input and output data. With the
[batch, in_channels, in_depth, in_height, in_width].
)doc");
+REGISTER_OP("MaxPool3DGradGrad")
+ .Input("orig_input: T")
+ .Input("orig_output: T")
+ .Input("grad: T")
+ .Output("output: T")
+ .Attr("ksize: list(int) >= 5 ")
+ .Attr("strides: list(int) >= 5")
+ .Attr(GetPaddingAttrString())
+ .Attr(GetConvnet3dDataFormatAttrString())
+ .Attr("T: realnumbertype")
+ .SetShapeFn([](InferenceContext* c) {
+ TF_RETURN_IF_ERROR(shape_inference::Pool3DShape(c));
+ ShapeHandle unused;
+ // Validate 'orig_input' is the same shape as 'grad'
+ TF_RETURN_IF_ERROR(c->Merge(c->input(0), c->input(2), &unused));
+ // Validate 'orig_output' is same shape as 'output'
+ TF_RETURN_IF_ERROR(c->Merge(c->input(1), c->output(0), &unused));
+ return Status::OK();
+ })
+ .Doc(R"doc(
+Computes second-order gradients of the maxpooling function.
+
+ksize: 1-D tensor of length 5. The size of the window for each dimension of
+ the input tensor. Must have `ksize[0] = ksize[4] = 1`.
+strides: 1-D tensor of length 5. The stride of the sliding window for each
+ dimension of `input`. Must have `strides[0] = strides[4] = 1`.
+padding: The type of padding algorithm to use.
+orig_input: The original input tensor.
+orig_output: The original output tensor.
+grad: Output backprop of shape `[batch, depth, rows, cols, channels]`.
+output: Gradients of gradients w.r.t. the input to `max_pool`.
+data_format: The data format of the input and output data. With the
+ default format "NDHWC", the data is stored in the order of:
+ [batch, in_depth, in_height, in_width, in_channels].
+ Alternatively, the format could be "NCDHW", the data storage order is:
+ [batch, in_channels, in_depth, in_height, in_width].
+)doc");
+
// --------------------------------------------------------------------------
REGISTER_OP("L2Loss")
@@ -1303,7 +1342,7 @@ output: The gradients for LRN.
// --------------------------------------------------------------------------
REGISTER_OP("MaxPool")
- .Attr("T: {float, half} = DT_FLOAT")
+ .Attr("T: realnumbertype = DT_FLOAT")
.Attr("ksize: list(int) >= 4")
.Attr("strides: list(int) >= 4")
.Attr(GetPaddingAttrString())
@@ -1336,7 +1375,7 @@ REGISTER_OP("MaxPoolGrad")
.Input("orig_output: T")
.Input("grad: T")
.Output("output: T")
- .Attr("T: {float, half} = DT_FLOAT")
+ .Attr("T: realnumbertype = DT_FLOAT")
.SetShapeFn([](InferenceContext* c) {
return UnchangedShapeWithRank(c, 4);
})
@@ -1358,6 +1397,43 @@ grad: 4-D. Gradients w.r.t. the output of `max_pool`.
output: Gradients w.r.t. the input to `max_pool`.
)doc");
+REGISTER_OP("MaxPoolGradGrad")
+ .Attr("ksize: list(int) >= 4")
+ .Attr("strides: list(int) >= 4")
+ .Attr(GetPaddingAttrString())
+ .Attr(GetConvnetDataFormatAttrString())
+ .Input("orig_input: T")
+ .Input("orig_output: T")
+ .Input("grad: T")
+ .Output("output: T")
+ .Attr("T: realnumbertype")
+ .SetShapeFn([](InferenceContext* c) {
+ TF_RETURN_IF_ERROR(shape_inference::MaxPoolShape(c));
+ ShapeHandle unused;
+ // Validate 'orig_input' is the same shape as 'grad'
+ TF_RETURN_IF_ERROR(c->Merge(c->input(0), c->input(2), &unused));
+ // Validate 'orig_output' is same shape as 'output'
+ TF_RETURN_IF_ERROR(c->Merge(c->input(1), c->output(0), &unused));
+ return Status::OK();
+ })
+ .Doc(R"doc(
+Computes second-order gradients of the maxpooling function.
+
+ksize: The size of the window for each dimension of the input tensor.
+strides: The stride of the sliding window for each dimension of the
+ input tensor.
+padding: The type of padding algorithm to use.
+data_format: Specify the data format of the input and output data. With the
+ default format "NHWC", the data is stored in the order of:
+ [batch, in_height, in_width, in_channels].
+  Alternatively, the format could be "NCHW", the data storage order is:
+ [batch, in_channels, in_height, in_width].
+orig_input: The original input tensor.
+orig_output: The original output tensor.
+grad: 4-D. Gradients of gradients w.r.t. the input of `max_pool`.
+output: Gradients of gradients w.r.t. the input to `max_pool`.
+)doc");
+
REGISTER_OP("MaxPoolWithArgmax")
.Attr("ksize: list(int) >= 4")
.Attr("strides: list(int) >= 4")
@@ -1366,7 +1442,7 @@ REGISTER_OP("MaxPoolWithArgmax")
.Input("input: T")
.Output("output: T")
.Output("argmax: Targmax")
- .Attr("T: {float, half} = DT_FLOAT")
+ .Attr("T: realnumbertype")
.SetShapeFn([](InferenceContext* c) {
TF_RETURN_IF_ERROR(shape_inference::MaxPoolShape(c));
c->set_output(1, c->output(0));
@@ -1397,7 +1473,7 @@ REGISTER_OP("MaxPoolGradWithArgmax")
.Input("grad: T")
.Input("argmax: Targmax")
.Output("output: T")
- .Attr("T: {float, half} = DT_FLOAT")
+ .Attr("T: realnumbertype")
.SetShapeFn([](InferenceContext* c) {
return UnchangedShapeWithRank(c, 4);
})
@@ -1415,6 +1491,39 @@ argmax: The indices of the maximum values chosen for each output of `max_pool`.
output: Gradients w.r.t. the input of `max_pool`.
)doc");
+REGISTER_OP("MaxPoolGradGradWithArgmax")
+ .Attr("ksize: list(int) >= 4")
+ .Attr("strides: list(int) >= 4")
+ .Attr(GetPaddingAttrString())
+ .Attr("Targmax: {int32, int64}")
+ .Input("input: T")
+ .Input("grad: T")
+ .Input("argmax: Targmax")
+ .Output("output: T")
+ .Attr("T: realnumbertype")
+ .SetShapeFn([](InferenceContext* c) {
+ TF_RETURN_IF_ERROR(shape_inference::MaxPoolShape(c));
+ ShapeHandle unused;
+ // Validate 'orig_input' is the same shape as 'grad'
+ TF_RETURN_IF_ERROR(c->Merge(c->input(0), c->input(1), &unused));
+ // Validate 'argmax' is same shape as 'output'
+ TF_RETURN_IF_ERROR(c->Merge(c->input(2), c->output(0), &unused));
+ return Status::OK();
+ })
+ .Doc(R"doc(
+Computes second-order gradients of the maxpooling function.
+
+ksize: The size of the window for each dimension of the input tensor.
+strides: The stride of the sliding window for each dimension of the
+ input tensor.
+padding: The type of padding algorithm to use.
+input: The original input.
+grad: 4-D with shape `[batch, height, width, channels]`. Gradients w.r.t. the
+ input of `max_pool`.
+argmax: The indices of the maximum values chosen for each output of `max_pool`.
+output: Gradients of gradients w.r.t. the input of `max_pool`.
+)doc");
+
// --------------------------------------------------------------------------
REGISTER_OP("Dilation2D")
@@ -2517,7 +2626,10 @@ REGISTER_OP("MklConv2D")
.Attr(GetConvnetDataFormatAttrString())
.SetShapeFn(shape_inference::Conv2DShape)
.Doc(R"doc(
-MKL version of Conv2D
+MKL version of Conv2D operator. Uses MKL DNN APIs to perform 2D convolution.
+
+NOTE: Do not invoke this operator directly in Python. The graph rewrite pass
+is expected to invoke these operators.
)doc");
REGISTER_OP("MklConv2DWithBias")
@@ -2533,14 +2645,216 @@ REGISTER_OP("MklConv2DWithBias")
.Attr("strides: list(int)")
.Attr("use_cudnn_on_gpu: bool = true")
.Attr(GetPaddingAttrString())
- .Attr(GetConvnetDataFormatAttrString());
+ .Attr(GetConvnetDataFormatAttrString())
+ .Doc(R"doc(
+MKL version of Conv2D and BiasAdd operator. Uses MKL DNN APIs to perform
+2D convolution and add Bias to the output of convolution.
+
+NOTE: Do not invoke this operator directly in Python. The graph rewrite pass
+is expected to invoke these operators.
+)doc");
+
+REGISTER_OP("MklConv2DBackpropFilter")
+ .Input("input: T")
+ .Input("mkl_input: uint8")
+ .Input("filter_sizes: int32")
+ .Input("mkl_filter_size: uint8")
+ .Input("out_backprop: T")
+ .Input("mkl_out_backprop: uint8")
+ .Output("output: T")
+ .Output("mkl_output: uint8")
+ .Attr("T: {half, float, double}")
+ .Attr("strides: list(int)")
+ .Attr("use_cudnn_on_gpu: bool = true")
+ .Attr(GetPaddingAttrString())
+ .Attr(GetConvnetDataFormatAttrString())
+ .SetShapeFn([](InferenceContext* c) {
+ return InputTensorShapeOrUnknown(c, 2 /* input_idx */, 4 /* ndims */);
+ })
+ .Doc(R"doc(
+MKL version of Conv2DBackpropFilter. Uses MKL DNN APIs to compute the
+gradients of convolution with respect to the filter.
+
+NOTE: Do not invoke this operator directly in Python. The graph rewrite pass
+is expected to invoke these operators.
+)doc");
+
+REGISTER_OP("MklConv2DWithBiasBackpropBias")
+ .Input("out_backprop: T")
+ .Input("mkl_out_backprop: uint8")
+ .Output("output: T")
+ .Output("mkl_output: uint8")
+ .Attr("T: {half, float, double}")
+ .Attr("strides: list(int)")
+ .Attr(GetConvnetDataFormatAttrString())
+ .Doc(R"doc(
+MKL version of Conv2DBackpropBias. Uses MKL DNN APIs to compute the
+gradients of convolution with respect to the bias.
+
+NOTE: Do not invoke this operator directly in Python. The graph rewrite pass
+is expected to invoke these operators.
+)doc");
+
+REGISTER_OP("MklConv2DBackpropInput")
+ .Input("input_sizes: int32")
+ .Input("mkl_input_sizes: uint8")
+ .Input("filter: T")
+ .Input("mkl_filter: uint8")
+ .Input("out_backprop: T")
+ .Input("mkl_out_backprop: uint8")
+ .Output("output: T")
+ .Output("mkl_output: uint8")
+ .Attr("T: {half, float, double}")
+ .Attr("strides: list(int)")
+ .Attr("use_cudnn_on_gpu: bool = true")
+ .Attr(GetPaddingAttrString())
+ .Attr(GetConvnetDataFormatAttrString())
+ .SetShapeFn([](InferenceContext* c) {
+ return InputTensorShapeOrUnknown(c, 0 /* input_idx */, 4 /* ndims */);
+ })
+ .Doc(R"doc(
+MKL version of Convolution2D backward input. Uses MKL DNN APIs to compute the
+gradients of convolution with respect to the input.
+
+NOTE: Do not invoke this operator directly in Python. The graph rewrite pass
+is expected to invoke these operators.
+)doc");
+
+REGISTER_OP("MklRelu")
+ .Input("features: T")
+ .Input("mkl_features: uint8")
+ .Output("activations: T")
+ .Output("mkl_activations: uint8")
+ .Attr("T: realnumbertype")
+ .SetShapeFn(shape_inference::UnchangedShape)
+ .Doc(R"doc(
+MKL version of Relu operator. Uses MKL DNN APIs to implement Relu operator.
+
+NOTE: Do not invoke this operator directly in Python. The graph rewrite pass
+is expected to invoke these operators.
+)doc");
+
+REGISTER_OP("MklReluGrad")
+ .Input("gradients: T")
+ .Input("mkl_gradients: uint8")
+ .Input("features: T")
+ .Input("mkl_features: uint8")
+ .Output("backprops: T")
+ .Output("mkl_backprops: uint8")
+ .Attr("T: realnumbertype")
+ .SetShapeFn(shape_inference::MergeBothInputsShapeFn)
+ .Doc(R"doc(
+MKL version of ReluGrad operator. Uses MKL DNN APIs to compute rectified
+linear gradients for Relu operation.
+
+NOTE: Do not invoke this operator directly in Python. The graph rewrite pass
+is expected to invoke these operators.
+)doc");
+
+REGISTER_OP("MklMaxPool")
+ .Attr("T: {float, half} = DT_FLOAT")
+ .Attr("ksize: list(int) >= 4")
+ .Attr("strides: list(int) >= 4")
+ .Attr(GetPaddingAttrString())
+ .Attr(GetConvnetDataFormatAttrString())
+ .Attr("workspace_enabled: bool = false")
+ .Input("input: T")
+ .Input("mkl_input: uint8")
+ .Output("output: T")
+ .Output("mkl_output: uint8")
+ .Output("workspace: T")
+ .Output("mkl_workspace: uint8")
+ .SetShapeFn(shape_inference::MaxPoolShape)
+ .Doc(R"doc(
+MKL version of MaxPool operator. Uses MKL DNN APIs to perform max pooling
+on the input.
+
+NOTE: Do not invoke this operator directly in Python. The graph rewrite pass
+is expected to invoke these operators.
+)doc");
+
+REGISTER_OP("MklMaxPoolGrad")
+ .Attr("T: {float, half} = DT_FLOAT")
+ .Attr("ksize: list(int) >= 4")
+ .Attr("strides: list(int) >= 4")
+ .Attr("workspace_enabled: bool = false")
+ .Attr(GetPaddingAttrString())
+ .Attr(GetConvnetDataFormatAttrString())
+ .Input("orig_input: T")
+ .Input("mkl_orig_input: uint8")
+ .Input("orig_output: T")
+ .Input("mkl_orig_output: uint8")
+ .Input("grad: T")
+ .Input("mkl_grad: uint8")
+ .Input("workspace: T")
+ .Input("mkl_workspace: uint8")
+ .Output("output: T")
+ .Output("mkl_output: uint8")
+ .SetShapeFn([](InferenceContext* c) {
+ return UnchangedShapeWithRank(c, 4);
+ })
+ .Doc(R"doc(
+MKL version of MaxPoolGrad. Uses MKL DNN APIs to compute gradients of
+MaxPool operator.
+
+NOTE: Do not invoke this operator directly in Python. The graph rewrite pass
+is expected to invoke these operators.
+)doc");
+
+REGISTER_OP("MklAvgPool")
+ .Input("value: T")
+ .Input("mkl_input: uint8")
+ .Output("output: T")
+ .Output("mkl_output: uint8")
+ .Attr("ksize: list(int) >= 4")
+ .Attr("strides: list(int) >= 4")
+ .Attr(GetPaddingAttrString())
+ .Attr(GetConvnetDataFormatAttrString())
+ .Attr("T: {float, half, double}")
+ .SetShapeFn(shape_inference::AvgPoolShape)
+ .Doc(R"doc(
+MKL version of AvgPool operator. Uses MKL DNN APIs to perform average pooling
+on the input.
+
+NOTE: Do not invoke this operator directly in Python. The graph rewrite pass
+is expected to invoke these operators.
+)doc");
+
+REGISTER_OP("MklAvgPoolGrad")
+ .Input("orig_input_shape: int32")
+ .Input("mkl_orig_input: uint8")
+ .Input("grad: T")
+ .Input("mkl_grad: uint8")
+ .Output("output: T")
+ .Output("mkl_output: uint8")
+ .Attr("ksize: list(int) >= 4")
+ .Attr("strides: list(int) >= 4")
+ .Attr(GetPaddingAttrString())
+ .Attr(GetConvnetDataFormatAttrString())
+ .Attr("T: {float, half, double}")
+ .SetShapeFn([](InferenceContext* c) {
+ return InputTensorShapeOrUnknown(c, 0 /* input_idx */, 4 /* ndims */);
+ })
+ .Doc(R"doc(
+MKL version of AvgPoolGrad operator. Uses MKL DNN APIs to compute gradients
+of AvgPool function.
+
+NOTE: Do not invoke this operator directly in Python. The graph rewrite pass
+is expected to invoke these operators.
+)doc");
REGISTER_OP("MklToTf")
.Input("input: T")
.Input("mkl_input: uint8")
.Output("output: T")
.Attr("T: {half, float, double}")
- .Attr(GetConvnetDataFormatAttrString());
+ .Attr(GetConvnetDataFormatAttrString())
+ .Doc(R"doc(
+MKL operator to convert a tensor from MKL layout to TensorFlow layout.
+
+NOTE: Do not invoke this operator directly in Python. The graph rewrite pass
+is expected to invoke these operators.
+)doc");
#endif // INTEL_MKL
} // namespace tensorflow
diff --git a/tensorflow/core/ops/ops.pbtxt b/tensorflow/core/ops/ops.pbtxt
index 8cea2b239e..d1f9bbb391 100644
--- a/tensorflow/core/ops/ops.pbtxt
+++ b/tensorflow/core/ops/ops.pbtxt
@@ -25843,6 +25843,59 @@ op {
description: "Read [the section on\nSegmentation](../../api_docs/python/math_ops.md#segmentation) for an explanation\nof segments.\n\nComputes a tensor such that\n`(output[i] = sum_{j...} data[j...]` where the sum is over tuples `j...` such\nthat `segment_ids[j...] == i`. Unlike `SegmentSum`, `segment_ids`\nneed not be sorted and need not cover all values in the full\nrange of valid values.\n\nIf the sum is empty for a given segment ID `i`, `output[i] = 0`.\n\n`num_segments` should equal the number of distinct segment IDs.\n\n<div style=\"width:70%; margin:auto; margin-bottom:10px; margin-top:20px;\">\n<img style=\"width:100%\" src=\"../../images/UnsortedSegmentSum.png\" alt>\n</div>"
}
op {
+ name: "UnsortedSegmentSum"
+ input_arg {
+ name: "data"
+ type_attr: "T"
+ }
+ input_arg {
+ name: "segment_ids"
+ description: "A tensor whose shape is a prefix of `data.shape`."
+ type_attr: "Tindices"
+ }
+ input_arg {
+ name: "num_segments"
+ type: DT_INT32
+ }
+ output_arg {
+ name: "output"
+ description: "Has same shape as data, except for the first `segment_ids.rank`\ndimensions, which are replaced with a single dimension which has size\n`num_segments`."
+ type_attr: "T"
+ }
+ attr {
+ name: "T"
+ type: "type"
+ allowed_values {
+ list {
+ type: DT_FLOAT
+ type: DT_DOUBLE
+ type: DT_INT64
+ type: DT_INT32
+ type: DT_UINT8
+ type: DT_UINT16
+ type: DT_INT16
+ type: DT_INT8
+ type: DT_QINT8
+ type: DT_QUINT8
+ type: DT_QINT32
+ type: DT_HALF
+ }
+ }
+ }
+ attr {
+ name: "Tindices"
+ type: "type"
+ allowed_values {
+ list {
+ type: DT_INT32
+ type: DT_INT64
+ }
+ }
+ }
+  summary: "Computes the sum along segments of a tensor."
+ description: "Read [the section on\nSegmentation](../../api_docs/python/math_ops.md#segmentation) for an explanation\nof segments.\n\nComputes a tensor such that\n\\\\(output_i = \\sum_j data_j\\\\) where sum is over `j` such\nthat `segment_ids[j] == i`. Unlike `SegmentSum`, `segment_ids`\nneed not be sorted and need not cover all values in the full\n range of valid values.\n\nIf the sum is empty for a given segment ID `i`, `output[i] = 0`.\n\n`num_segments` should equal the number of distinct segment IDs.\n\n<div style=\"width:70%; margin:auto; margin-bottom:10px; margin-top:20px;\">\n<img style=\"width:100%\" src=\"../../images/UnsortedSegmentSum.png\" alt>\n</div>"
+}
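
[Editor's note] Stripped of the proto plumbing, the op's semantics are a scatter-add: each data element accumulates into the output row named by its segment id, and empty segments stay zero. A 1-D scalar sketch, illustrative rather than TensorFlow's kernel:

#include <cstddef>
#include <vector>

// out[i] accumulates every data[j] whose segment id is i; segments that
// receive nothing stay at zero.
std::vector<float> unsorted_segment_sum(const std::vector<float>& data,
                                        const std::vector<int>& segment_ids,
                                        int num_segments) {
  std::vector<float> out(num_segments, 0.0f);
  for (std::size_t j = 0; j < data.size(); ++j) {
    out[segment_ids[j]] += data[j];
  }
  return out;
}
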
+op {
name: "Unstage"
output_arg {
name: "values"
diff --git a/tensorflow/core/platform/cpu_info.cc b/tensorflow/core/platform/cpu_info.cc
index 5aa8c66a0b..906826e6f8 100644
--- a/tensorflow/core/platform/cpu_info.cc
+++ b/tensorflow/core/platform/cpu_info.cc
@@ -67,11 +67,8 @@ int GetXCR0EAX() {
#endif
// Structure for basic CPUID info
-struct CPUIDInfo {
- string vendor_str;
- int family;
- int model_num;
-
+class CPUIDInfo {
+ public:
CPUIDInfo()
: have_adx_(0),
have_aes_(0),
@@ -120,9 +117,9 @@ struct CPUIDInfo {
// Get vendor string (issue CPUID with eax = 0)
GETCPUID(eax, ebx, ecx, edx, 0, 0);
- cpuid->vendor_str.append(reinterpret_cast<char *>(&ebx), 4);
- cpuid->vendor_str.append(reinterpret_cast<char *>(&edx), 4);
- cpuid->vendor_str.append(reinterpret_cast<char *>(&ecx), 4);
+ cpuid->vendor_str_.append(reinterpret_cast<char *>(&ebx), 4);
+ cpuid->vendor_str_.append(reinterpret_cast<char *>(&edx), 4);
+ cpuid->vendor_str_.append(reinterpret_cast<char *>(&ecx), 4);
// To get general information and extended features we send eax = 1 and
// ecx = 0 to cpuid. The response is returned in eax, ebx, ecx and edx.
@@ -130,8 +127,8 @@ struct CPUIDInfo {
// Volume 2A: Instruction Set Reference, A-M CPUID).
GETCPUID(eax, ebx, ecx, edx, 1, 0);
- cpuid->model_num = static_cast<int>((eax >> 4) & 0xf);
- cpuid->family = static_cast<int>((eax >> 8) & 0xf);
+ cpuid->model_num_ = static_cast<int>((eax >> 4) & 0xf);
+ cpuid->family_ = static_cast<int>((eax >> 8) & 0xf);
cpuid->have_aes_ = (ecx >> 25) & 0x1;
cpuid->have_cmov_ = (edx >> 15) & 0x1;
@@ -253,6 +250,10 @@ struct CPUIDInfo {
return false;
}
+ string vendor_str() const { return vendor_str_; }
+ int family() const { return family_; }
+  int model_num() const { return model_num_; }
+
private:
int highest_eax_;
int have_adx_ : 1;
@@ -292,6 +293,9 @@ struct CPUIDInfo {
int have_sse4_2_ : 1;
int have_ssse3_ : 1;
int have_hypervisor_ : 1;
+ string vendor_str_;
+ int family_;
+ int model_num_;
};
std::once_flag cpuid_once_flag;
@@ -317,7 +321,7 @@ bool TestCPUFeature(CPUFeature feature) {
std::string CPUVendorIDString() {
#ifdef PLATFORM_IS_X86
InitCPUIDInfo();
- return cpuid->vendor_str;
+ return cpuid->vendor_str();
#else
return "";
#endif
@@ -326,7 +330,7 @@ std::string CPUVendorIDString() {
int CPUFamily() {
#ifdef PLATFORM_IS_X86
InitCPUIDInfo();
- return cpuid->family;
+ return cpuid->family();
#else
return 0;
#endif
@@ -335,7 +339,7 @@ int CPUFamily() {
int CPUModelNum() {
#ifdef PLATFORM_IS_X86
InitCPUIDInfo();
- return cpuid->model_num;
+ return cpuid->model_num();
#else
return 0;
#endif
diff --git a/tensorflow/core/platform/windows/port.cc b/tensorflow/core/platform/windows/port.cc
index 3394524aa5..85b53e07c4 100644
--- a/tensorflow/core/platform/windows/port.cc
+++ b/tensorflow/core/platform/windows/port.cc
@@ -13,6 +13,10 @@ See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
+#ifdef TENSORFLOW_USE_JEMALLOC
+#include "jemalloc/jemalloc.h"
+#endif
+
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
@@ -53,16 +57,55 @@ int NumSchedulableCPUs() {
}
void* AlignedMalloc(size_t size, int minimum_alignment) {
+#ifdef TENSORFLOW_USE_JEMALLOC
+ void* ptr = NULL;
+  // posix_memalign requires that the requested alignment be at least
+  // sizeof(void*). If a smaller alignment is requested, fall back on Malloc,
+  // which should return memory aligned to at least the size of a pointer.
+ const int required_alignment = sizeof(void*);
+ if (minimum_alignment < required_alignment) return Malloc(size);
+ int err = jemalloc_posix_memalign(&ptr, minimum_alignment, size);
+ if (err != 0) {
+ return NULL;
+ } else {
+ return ptr;
+ }
+#else
return _aligned_malloc(size, minimum_alignment);
+#endif
}
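A small usage sketch of the alignment contract implemented above (assuming the declarations in tensorflow/core/platform/mem.h; the sizes are illustrative):

```c++
#include <cassert>
#include <cstdint>

#include "tensorflow/core/platform/mem.h"

// Request 64-byte-aligned memory and verify the contract described above:
// alignments below sizeof(void*) fall back on Malloc, larger ones go through
// the aligned allocator (jemalloc's posix_memalign here, or _aligned_malloc).
void AlignedMallocExample() {
  void* p = tensorflow::port::AlignedMalloc(/*size=*/1024,
                                            /*minimum_alignment=*/64);
  assert(p != nullptr);
  assert(reinterpret_cast<std::uintptr_t>(p) % 64 == 0);
  tensorflow::port::AlignedFree(p);  // must pair with AlignedMalloc
}
```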
-void AlignedFree(void* aligned_memory) { _aligned_free(aligned_memory); }
+void AlignedFree(void* aligned_memory) {
+#ifdef TENSORFLOW_USE_JEMALLOC
+ jemalloc_free(aligned_memory);
+#else
+ _aligned_free(aligned_memory);
+#endif
+}
-void* Malloc(size_t size) { return ::malloc(size); }
+void* Malloc(size_t size) {
+#ifdef TENSORFLOW_USE_JEMALLOC
+ return jemalloc_malloc(size);
+#else
+ return malloc(size);
+#endif
+}
-void* Realloc(void* ptr, size_t size) { return ::realloc(ptr, size); }
+void* Realloc(void* ptr, size_t size) {
+#ifdef TENSORFLOW_USE_JEMALLOC
+ return jemalloc_realloc(ptr, size);
+#else
+ return realloc(ptr, size);
+#endif
+}
-void Free(void* ptr) { ::free(ptr); }
+void Free(void* ptr) {
+#ifdef TENSORFLOW_USE_JEMALLOC
+ return jemalloc_free(ptr);
+#else
+ return free(ptr);
+#endif
+}
void MallocExtension_ReleaseToSystem(std::size_t num_bytes) {
// No-op.
diff --git a/tensorflow/core/util/mkl_util.h b/tensorflow/core/util/mkl_util.h
index abd5a16ed5..ebbe195bbc 100644
--- a/tensorflow/core/util/mkl_util.h
+++ b/tensorflow/core/util/mkl_util.h
@@ -16,6 +16,10 @@ limitations under the License.
#ifndef TENSORFLOW_CORE_UTIL_MKL_UTIL_H_
#define TENSORFLOW_CORE_UTIL_MKL_UTIL_H_
#ifdef INTEL_MKL
+
+#include <string>
+#include <vector>
+
#include "third_party/mkl/include/mkl_dnn.h"
#include "third_party/mkl/include/mkl_dnn_types.h"
#include "third_party/mkl/include/mkl_service.h"
@@ -40,6 +44,8 @@ namespace tensorflow {
// MKL operation, and did not go through a conversion to a standard
// Tensorflow tensor.
+typedef enum { W = 0, H = 1, C = 2, N = 3 } MklDims;
+
class MklShape {
public:
MklShape() {}
@@ -50,12 +56,15 @@ class MklShape {
if (strides_) delete[] strides_;
if (mklLayout_) CHECK_EQ(dnnLayoutDelete_F32(mklLayout_), E_SUCCESS);
if (tfLayout_) CHECK_EQ(dnnLayoutDelete_F32(tfLayout_), E_SUCCESS);
+ if (tf_to_mkl_dim_map_) delete[] tf_to_mkl_dim_map_;
}
const bool IsMklTensor() const { return isMklTensor_; }
void SetMklTensor(const bool isMklTensor) { isMklTensor_ = isMklTensor; }
+ void SetDimensions(const size_t dimension) { dimension_ = dimension; }
+
void SetMklLayout(const void* primitive, size_t resourceType) {
CHECK_EQ(
dnnLayoutCreateFromPrimitive_F32(&mklLayout_, (dnnPrimitive_t)primitive,
@@ -66,7 +75,8 @@ class MklShape {
void SetTfLayout(const size_t dimension, const size_t* sizes,
const size_t* strides) {
dimension_ = dimension;
- if (dimension > 0) { // MKl doesn't support dimension 0
+
+    if (dimension > 0) {  // MKL doesn't support zero-dimension tensors
sizes_ = new size_t[dimension];
strides_ = new size_t[dimension];
@@ -79,6 +89,45 @@ class MklShape {
}
}
+ // Default case - MKL dim ordering is opposite of TF dim ordering
+ // MKL -> (DIMS-1)...0 where (DIMS-1) is outermost dim and 0 is innermost dim
+ // TF -> 0...(DIMS-1) where 0 is outermost dim and (DIMS-1) is innermost dim
+ // For layers that rely on data_format semantics (conv, pooling etc.)
+ // or operate only on certain dimensions (relu, concat, split etc.),
+ // Mkl APIs might require us to reorder these dimensions. In such cases,
+  // kernels should explicitly set this map.
+ void SetTfDimOrder(const size_t dimension) {
+ CHECK(dimension == dimension_);
+ if (tf_to_mkl_dim_map_ == nullptr) {
+ tf_to_mkl_dim_map_ = new size_t[dimension];
+ }
+ for (size_t ii = 0; ii < dimension; ii++) {
+ tf_to_mkl_dim_map_[ii] = dimension - (ii + 1);
+ }
+ }
+
+ void SetTfDimOrder(const size_t dimension, const size_t* tf_to_mkl_dim_map) {
+ CHECK(dimension == dimension_);
+ if (tf_to_mkl_dim_map_ == nullptr) {
+ tf_to_mkl_dim_map_ = new size_t[dimension];
+ }
+ for (size_t ii = 0; ii < dimension; ii++) {
+ tf_to_mkl_dim_map_[ii] = tf_to_mkl_dim_map[ii];
+ }
+ }
+
+ void SetTfDimOrder(const size_t dimension, TensorFormat data_format) {
+ CHECK_EQ(dimension, 4);
+ CHECK(dimension == dimension_);
+ if (tf_to_mkl_dim_map_ == nullptr) {
+ tf_to_mkl_dim_map_ = new size_t[dimension];
+ }
+ tf_to_mkl_dim_map_[GetTensorDimIndex<2>(data_format, 'W')] = MklDims::W;
+ tf_to_mkl_dim_map_[GetTensorDimIndex<2>(data_format, 'H')] = MklDims::H;
+ tf_to_mkl_dim_map_[GetTensorDimIndex<2>(data_format, 'C')] = MklDims::C;
+ tf_to_mkl_dim_map_[GetTensorDimIndex<2>(data_format, 'N')] = MklDims::N;
+ }
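To make the comment above concrete, this is the map the `data_format` overload produces for a 4-D NHWC tensor (a worked illustration, not library code):

```c++
// TF index order (NHWC): 0=N, 1=H, 2=W, 3=C.  MklDims: W=0, H=1, C=2, N=3.
// SetTfDimOrder(4, FORMAT_NHWC) therefore fills:
//   tf_to_mkl_dim_map_[0] = MklDims::N (3)  // TF 'N' is MKL's outermost dim
//   tf_to_mkl_dim_map_[1] = MklDims::H (1)
//   tf_to_mkl_dim_map_[2] = MklDims::W (0)  // TF 'W' is MKL's innermost dim
//   tf_to_mkl_dim_map_[3] = MklDims::C (2)
```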
+
const dnnLayout_t GetMklLayout() const { return mklLayout_; }
const dnnLayout_t GetTfLayout() const { return tfLayout_; }
const dnnLayout_t GetCurLayout() const {
@@ -86,7 +135,10 @@ class MklShape {
}
size_t GetDimension() const { return dimension_; }
const size_t* GetSizes() const { return sizes_; }
+ int64 dim_size(int index) const { return sizes_[index]; }
const size_t* GetStrides() const { return strides_; }
+ const size_t* GetTfToMklDimMap() const { return tf_to_mkl_dim_map_; }
+ size_t tf_dim_idx(int index) const { return tf_to_mkl_dim_map_[index]; }
void GetConvertedFlatData(dnnLayout_t targetLayout, void* input,
void* output) const {
@@ -107,21 +159,23 @@ class MklShape {
// The data is serialized in this order
// isMklTensor_
// dimension_
-// sizes
-// strides
+// sizes_
+// strides_
// mklLayout_
// tfLayout_
+// tf_to_mkl_dim_map_
#define SIZE_OF_MKL_DNN_BUF \
(dnnLayoutSerializationBufferSize_F32()) // Size of buffer needed to
// serialize dnn_layout pointer
// Size of buffer to hold the serialized object, the size is computed as follows
-// sizeof(isMklTensor_) + sizeof(dimension_) + sizeof(sizes) + sizeof(strides)
+// sizeof(isMklTensor_) + sizeof(dimension_) + sizeof(sizes_) + sizeof(strides_)
// + sizeof(mklLayout_ buffer) + sizeof(tfLayout_ buffer)
+// + sizeof(tf_to_mkl_dim_map_)
#define SIZE_OF_MKL_SERIAL_DATA(dims) \
- (2 * sizeof(size_t) + 2 * dims * sizeof(size_t) + 2 * SIZE_OF_MKL_DNN_BUF)
+ (2 * sizeof(size_t) + 3 * dims * sizeof(size_t) + 2 * SIZE_OF_MKL_DNN_BUF)
// First we need to define some macros for offsets into the serial buffer where
// different elements of MklShape are written/read from
@@ -140,6 +194,9 @@ class MklShape {
(STRIDES_OFFSET(dims) + dims * sizeof(size_t)) // Location of mklLayout_
#define TF_LAYOUT_OFFSET(dims) \
(MKL_LAYOUT_OFFSET(dims) + SIZE_OF_MKL_DNN_BUF) // Location of tfLayout_
+// Location of tf_to_mkl_dim_map_
+#define TF_TO_MKL_DIM_MAP_OFFSET(dims) \
+ (TF_LAYOUT_OFFSET(dims) + SIZE_OF_MKL_DNN_BUF)
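A worked layout for `dims == 4`, assuming `sizeof(size_t) == 8` and that `SIZES_OFFSET(dims)` is `2 * sizeof(size_t)` (illustrative; the authoritative values are the macros above):

```c++
//   isMklTensor_           offset 0                     (8 bytes)
//   dimension_             offset 8                     (8 bytes)
//   sizes_[4]              SIZES_OFFSET         = 16    (32 bytes)
//   strides_[4]            STRIDES_OFFSET       = 48    (32 bytes)
//   mklLayout_ buffer      MKL_LAYOUT_OFFSET    = 80    (SIZE_OF_MKL_DNN_BUF)
//   tfLayout_ buffer       TF_LAYOUT_OFFSET     = 80 + SIZE_OF_MKL_DNN_BUF
//   tf_to_mkl_dim_map_[4]  TF_TO_MKL_DIM_MAP_OFFSET
//                          = 80 + 2 * SIZE_OF_MKL_DNN_BUF (32 bytes)
//
// Total = 2*8 + 3*4*8 + 2*SIZE_OF_MKL_DNN_BUF = SIZE_OF_MKL_SERIAL_DATA(4),
// matching the macro above (note the 2 -> 3 bump for the new dim map array).
```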
// TODO(agramesh1) make sure to create a const to share with rewrite pass
// for min size of MKL metadata tensor.
@@ -156,11 +213,14 @@ class MklShape {
<< "Bufsize too small in DeSerialize";
sizes_ = new size_t[dimension_];
strides_ = new size_t[dimension_];
+ tf_to_mkl_dim_map_ = new size_t[dimension_];
for (int i = 0; i < dimension_; i++) {
sizes_[i] =
reinterpret_cast<const size_t*>(buf + SIZES_OFFSET(dimension_))[i];
strides_[i] = reinterpret_cast<const size_t*>(
buf + STRIDES_OFFSET(dimension_))[i];
+ tf_to_mkl_dim_map_[i] = reinterpret_cast<const size_t*>(
+ buf + TF_TO_MKL_DIM_MAP_OFFSET(dimension_))[i];
}
CHECK_EQ(dnnLayoutDeserialize_F32(&mklLayout_,
buf + MKL_LAYOUT_OFFSET(dimension_)),
@@ -183,6 +243,9 @@ class MklShape {
sizes_[i];
reinterpret_cast<size_t*>(buf + STRIDES_OFFSET(dimension_))[i] =
strides_[i];
+ reinterpret_cast<size_t*>(buf +
+ TF_TO_MKL_DIM_MAP_OFFSET(dimension_))[i] =
+ tf_to_mkl_dim_map_[i];
}
CHECK_EQ(dnnLayoutSerialize_F32(mklLayout_,
buf + MKL_LAYOUT_OFFSET(dimension_)),
@@ -202,6 +265,8 @@ class MklShape {
size_t dimension_ = 0;
size_t* sizes_ = nullptr; // Required by MKL for conversions
size_t* strides_ = nullptr; // Required by MKL for conversions
+ // TF dimension corresponding to this MKL dimension
+ size_t* tf_to_mkl_dim_map_ = nullptr;
};
int inline GetTensorDataIndex(int n) {
@@ -275,18 +340,78 @@ inline void GetStridesFromSizes(TensorFormat data_format, size_t* strides,
}
}
+inline void MklSizesToTFSizes(OpKernelContext* context,
+ TensorFormat data_format_,
+ const MklShape& mklshape, TensorShape* tfshape) {
+ size_t tf_dim = mklshape.GetDimension();
+ const size_t* tf_sizes = mklshape.GetSizes();
+
+ // TODO(agramesh1): check if this constraint is applicable in other cases
+ // (besides BackpropInput, BackpropFilter).
+ OP_REQUIRES(context, tf_dim == 4,
+              errors::InvalidArgument("MklSizesToTFSizes: shape must be 4-D"));
+ std::vector<int32> sizes;
+
+ sizes.push_back(tf_sizes[3]);
+
+ if (data_format_ == FORMAT_NHWC) {
+ sizes.push_back(tf_sizes[1]);
+ sizes.push_back(tf_sizes[0]);
+ sizes.push_back(tf_sizes[2]);
+ } else {
+ sizes.push_back(tf_sizes[2]);
+ sizes.push_back(tf_sizes[1]);
+ sizes.push_back(tf_sizes[0]);
+ }
+
+ OP_REQUIRES_OK(context, TensorShapeUtils::MakeShape(sizes, tfshape));
+}
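A standalone sketch of the reorder performed by `MklSizesToTFSizes` (illustration only; `MklToNhwc`/`MklToNchw` are hypothetical helpers, and the real function also validates via `OP_REQUIRES`):

```c++
#include <array>

// MKL stores 4-D sizes in MklDims order {W, H, C, N}; TF wants them in
// data_format order. These helpers mirror the two branches above.
std::array<int, 4> MklToNhwc(const std::array<int, 4>& mkl_sizes) {
  return {{mkl_sizes[3], mkl_sizes[1], mkl_sizes[0], mkl_sizes[2]}};  // N,H,W,C
}
std::array<int, 4> MklToNchw(const std::array<int, 4>& mkl_sizes) {
  return {{mkl_sizes[3], mkl_sizes[2], mkl_sizes[1], mkl_sizes[0]}};  // N,C,H,W
}

// Example: MKL sizes {W=7, H=5, C=3, N=2}:
//   MklToNhwc({{7, 5, 3, 2}}) == {{2, 5, 7, 3}}
//   MklToNchw({{7, 5, 3, 2}}) == {{2, 3, 5, 7}}
```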
+
+inline int32 GetMklTensorDimIndex(char dimension) {
+ switch (dimension) {
+ case 'N':
+ return MklDims::N;
+ case 'C':
+ return MklDims::C;
+ case 'H':
+ return MklDims::H;
+ case 'W':
+ return MklDims::W;
+ default:
+ LOG(FATAL) << "Invalid dimension: " << dimension;
+ return -1; // Avoid compiler warning about missing return value
+ }
+}
+
+inline int64 GetMklTensorDim(const MklShape& mklshape, char dimension) {
+ int index = GetMklTensorDimIndex(dimension);
+ CHECK(index >= 0 && index < mklshape.GetDimension())
+      << "Invalid index " << index << " for dimension '" << dimension << "'";
+ return mklshape.dim_size(index);
+}
+
namespace mkl_layer_registry {
static const char* kMklLayerLabel = "MklLayer";
-static const string kMklLayerLabelPattern = "label='MklLayer'";
+static const char* kMklLayerLabelPattern = "label='MklLayer'";
-// Check whether opname is registered as MKL-compliant in the registry.
+// Check whether opname with type T is registered as MKL-compliant.
//
// @input: name of the op
+// @input: T, the datatype against which the op's registration is checked
// @return: true if opname is registered as Mkl layer op
-static inline bool IsMklLayer(const std::string& op_name) {
+static inline bool IsMklLayer(const std::string& op_name, DataType T) {
string kernel = KernelsRegisteredForOp(op_name);
- return kernel.find(kMklLayerLabelPattern) != string::npos;
+  // MKL currently supports only the float type for these ops, so we also
+  // require T == DT_FLOAT. Ideally we would query the kernel registration
+  // for support of type T directly, but there is no API to look up a kernel
+  // registration by name and type.
+ bool result =
+ (kernel.find(kMklLayerLabelPattern) != string::npos) && (T == DT_FLOAT);
+  if (result) {
+ VLOG(1) << "mkl_layer_registry::" << op_name << " is " << kMklLayerLabel;
+ }
+ return result;
}
} // namespace mkl_layer_registry
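A hypothetical call site for the type-aware check (the op name and surrounding logic are assumptions for illustration, not part of this change):

```c++
// Sketch: a graph rewrite pass can now gate MKL rewrites on the data type.
if (mkl_layer_registry::IsMklLayer("Conv2D", DT_FLOAT)) {
  // Safe to rewrite this node to its MKL-compliant variant.
}
```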
diff --git a/tensorflow/docs_src/about/roadmap.md b/tensorflow/docs_src/about/roadmap.md
index 76c734830a..1789e050fa 100644
--- a/tensorflow/docs_src/about/roadmap.md
+++ b/tensorflow/docs_src/about/roadmap.md
@@ -12,9 +12,8 @@ we do not have timelines for these features.
### Improve non-Python language support
-* Improve C++ API for graph construction and gradients
-* Java language support
-* Go language support
+* Support for adding gradient computation for graphs constructed in other
+  languages (C++, Java, Go, etc.)
### Making TensorFlow easier to use
* High-level APIs
diff --git a/tensorflow/docs_src/extend/adding_an_op.md b/tensorflow/docs_src/extend/adding_an_op.md
index 4fc4c2faa2..45f7530506 100644
--- a/tensorflow/docs_src/extend/adding_an_op.md
+++ b/tensorflow/docs_src/extend/adding_an_op.md
@@ -229,7 +229,7 @@ do the following to run it from Python :
```python
import tensorflow as tf
-zero_out_module = tf.load_op_library('zero_out.so')
+zero_out_module = tf.load_op_library('./zero_out.so')
with tf.Session(''):
zero_out_module.zero_out([[1, 2], [3, 4]]).eval()
@@ -243,14 +243,13 @@ named `ZeroOut` in the C++ files, the python function will be called `zero_out`.
To make the op available as a regular function `import`-able from a Python
module, it may be useful to have the `load_op_library` call in a Python source
-file as follows (see [zero_out_op_1.py](https://www.tensorflow.org/code/tensorflow/examples/adding_an_op/zero_out_op_1.py))
-:
+file as follows:
```python
import tensorflow as tf
-_zero_out_module = tf.load_op_library('zero_out_op_kernel_1.so')
-zero_out = _zero_out_module.zero_out
+zero_out_module = tf.load_op_library('./zero_out.so')
+zero_out = zero_out_module.zero_out
```
## Verify that the op works
@@ -264,7 +263,7 @@ import tensorflow as tf
class ZeroOutTest(tf.test.TestCase):
def testZeroOut(self):
- zero_out_module = tf.load_op_library('zero_out.so')
+ zero_out_module = tf.load_op_library('./zero_out.so')
with self.test_session():
result = zero_out_module.zero_out([5, 4, 3, 2, 1])
self.assertAllEqual(result.eval(), [5, 0, 0, 0, 0])
diff --git a/tensorflow/docs_src/get_started/get_started.md b/tensorflow/docs_src/get_started/get_started.md
index b71249de0a..04ac3f5848 100644
--- a/tensorflow/docs_src/get_started/get_started.md
+++ b/tensorflow/docs_src/get_started/get_started.md
@@ -71,7 +71,7 @@ is a constant. Like all TensorFlow constants, it takes no inputs, and it outputs
a value it stores internally. We can create two floating point Tensors `node1`
and `node2` as follows:
```python
-node1 = tf.constant(3.0, tf.float32)
+node1 = tf.constant(3.0, dtype=tf.float32)
node2 = tf.constant(4.0) # also tf.float32 implicitly
print(node1, node2)
```
@@ -110,7 +110,7 @@ print("sess.run(node3): ",sess.run(node3))
```
The last two print statements produce
```
-node3: Tensor("Add_2:0", shape=(), dtype=float32)
+node3: Tensor("Add:0", shape=(), dtype=float32)
sess.run(node3): 7.0
```
@@ -173,8 +173,8 @@ initial value:
```python
-W = tf.Variable([.3], tf.float32)
-b = tf.Variable([-.3], tf.float32)
+W = tf.Variable([.3], dtype=tf.float32)
+b = tf.Variable([-.3], dtype=tf.float32)
x = tf.placeholder(tf.float32)
linear_model = W * x + b
```
@@ -294,8 +294,8 @@ import numpy as np
import tensorflow as tf
# Model parameters
-W = tf.Variable([.3], tf.float32)
-b = tf.Variable([-.3], tf.float32)
+W = tf.Variable([.3], dtype=tf.float32)
+b = tf.Variable([-.3], dtype=tf.float32)
# Model input and output
x = tf.placeholder(tf.float32)
linear_model = W * x + b
diff --git a/tensorflow/docs_src/tutorials/wide.md b/tensorflow/docs_src/tutorials/wide.md
index 471811ea1a..1b72ba0746 100644
--- a/tensorflow/docs_src/tutorials/wide.md
+++ b/tensorflow/docs_src/tutorials/wide.md
@@ -27,7 +27,7 @@ https://github.com/tensorflow/tensorflow/blob/master/tensorflow/examples/learn/w
# Mac OS X
$ sudo easy_install pip
$ sudo easy_install --upgrade six
- ```
+ ```
2. Use `pip` to install pandas:
diff --git a/tensorflow/examples/android/README.md b/tensorflow/examples/android/README.md
index 81a2a66617..0414566b98 100644
--- a/tensorflow/examples/android/README.md
+++ b/tensorflow/examples/android/README.md
@@ -32,9 +32,7 @@ on API >= 14 devices.
(https://arxiv.org/abs/1610.07629) to restyle the camera preview image
to that of a number of different artists.
-<img src="sample_images/classify1.jpg" width="30%">
-<img src="sample_images/stylize1.jpg" width="30%">
-<img src="sample_images/detect1.jpg" width="30%">
+<img src="sample_images/classify1.jpg" width="30%"><img src="sample_images/stylize1.jpg" width="30%"><img src="sample_images/detect1.jpg" width="30%">
## Prebuilt APK:
@@ -83,7 +81,7 @@ instead.
Bazel is the primary build system for TensorFlow. To build with Bazel,
it and the Android NDK and SDK must be installed on your system.
-1. Get the recommended Bazel version listed in [os_setup.html](https://www.tensorflow.org/versions/master/get_started/os_setup.html#source)
+1. Install the latest version of Bazel as per the instructions [on the Bazel website](https://bazel.build/versions/master/docs/install.html).
2. The Android NDK is required to build the native (C/C++) TensorFlow code.
The current recommended version is 12b, which may be found
[here](https://developer.android.com/ndk/downloads/older_releases.html#ndk-12b-downloads).
@@ -96,7 +94,7 @@ it and the Android NDK and SDK must be installed on your system.
##### Edit WORKSPACE
-The Android entries in [`<workspace_root>/WORKSPACE`](../../../WORKSPACE#L2-L13)
+The Android entries in [`<workspace_root>/WORKSPACE`](../../../WORKSPACE#L19-L32)
must be uncommented with the paths filled in appropriately depending on where
you installed the NDK and SDK. Otherwise an error such as:
"The external label '//external:android/sdk' is not bound to anything" will
diff --git a/tensorflow/examples/android/build.gradle b/tensorflow/examples/android/build.gradle
index ed05a083a9..4f241027f4 100644
--- a/tensorflow/examples/android/build.gradle
+++ b/tensorflow/examples/android/build.gradle
@@ -67,7 +67,7 @@ apply plugin: 'com.android.application'
android {
compileSdkVersion 23
- buildToolsVersion "25.0.1"
+ buildToolsVersion "25.0.2"
lintOptions {
abortOnError false
diff --git a/tensorflow/examples/tutorials/deepdream/deepdream.ipynb b/tensorflow/examples/tutorials/deepdream/deepdream.ipynb
index 016b21cd12..4ff8e368c4 100644
--- a/tensorflow/examples/tutorials/deepdream/deepdream.ipynb
+++ b/tensorflow/examples/tutorials/deepdream/deepdream.ipynb
@@ -278,7 +278,7 @@
" tensor = n.attr['value'].tensor\n",
" size = len(tensor.tensor_content)\n",
" if size > max_const_size:\n",
- " tensor.tensor_content = bytes(\"<stripped %d bytes>\"%size)\n",
+ " tensor.tensor_content = tf.compat.as_bytes(\"<stripped %d bytes>\"%size)\n",
" return strip_def\n",
" \n",
"def rename_nodes(graph_def, rename_func):\n",
diff --git a/tensorflow/examples/tutorials/monitors/iris_monitors.py b/tensorflow/examples/tutorials/monitors/iris_monitors.py
index a4bf353856..850d105f7b 100644
--- a/tensorflow/examples/tutorials/monitors/iris_monitors.py
+++ b/tensorflow/examples/tutorials/monitors/iris_monitors.py
@@ -21,7 +21,6 @@ import os
import numpy as np
import tensorflow as tf
-from tensorflow.contrib.learn.python.learn.metric_spec import MetricSpec
tf.logging.set_verbosity(tf.logging.INFO)
@@ -41,18 +40,15 @@ def main(unused_argv):
"accuracy":
tf.contrib.learn.MetricSpec(
metric_fn=tf.contrib.metrics.streaming_accuracy,
- prediction_key=
- tf.contrib.learn.prediction_key.PredictionKey.CLASSES),
+ prediction_key="classes"),
"precision":
tf.contrib.learn.MetricSpec(
metric_fn=tf.contrib.metrics.streaming_precision,
- prediction_key=
- tf.contrib.learn.prediction_key.PredictionKey.CLASSES),
+ prediction_key="classes"),
"recall":
tf.contrib.learn.MetricSpec(
metric_fn=tf.contrib.metrics.streaming_recall,
- prediction_key=
- tf.contrib.learn.prediction_key.PredictionKey.CLASSES)
+ prediction_key="classes")
}
validation_monitor = tf.contrib.learn.monitors.ValidationMonitor(
test_set.data,
@@ -66,26 +62,6 @@ def main(unused_argv):
# Specify that all features have real-value data
feature_columns = [tf.contrib.layers.real_valued_column("", dimension=4)]
- validation_metrics = {
- "accuracy": MetricSpec(
- metric_fn=tf.contrib.metrics.streaming_accuracy,
- prediction_key="classes"),
- "recall": MetricSpec(
- metric_fn=tf.contrib.metrics.streaming_recall,
- prediction_key="classes"),
- "precision": MetricSpec(
- metric_fn=tf.contrib.metrics.streaming_precision,
- prediction_key="classes")
- }
- validation_monitor = tf.contrib.learn.monitors.ValidationMonitor(
- test_set.data,
- test_set.target,
- every_n_steps=50,
- metrics=validation_metrics,
- early_stopping_metric="loss",
- early_stopping_metric_minimize=True,
- early_stopping_rounds=200)
-
# Build 3 layer DNN with 10, 20, 10 units respectively.
classifier = tf.contrib.learn.DNNClassifier(
feature_columns=feature_columns,
diff --git a/tensorflow/go/doc.go b/tensorflow/go/doc.go
index 79fbf9797e..a59652b160 100644
--- a/tensorflow/go/doc.go
+++ b/tensorflow/go/doc.go
@@ -1,16 +1,18 @@
-// Copyright 2016 The TensorFlow Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-// http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
+/*
+Copyright 2016 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+*/
// Package tensorflow is a Go binding to TensorFlow.
//
diff --git a/tensorflow/go/example_inception_inference_test.go b/tensorflow/go/example_inception_inference_test.go
index 42d169ed9a..682bd245cc 100644
--- a/tensorflow/go/example_inception_inference_test.go
+++ b/tensorflow/go/example_inception_inference_test.go
@@ -1,16 +1,18 @@
-// Copyright 2016 The TensorFlow Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-// http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
+/*
+Copyright 2016 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+*/
package tensorflow_test
@@ -26,8 +28,8 @@ import (
"os"
"path/filepath"
- tf "github.com/tensorflow/tensorflow/tensorflow/go"
"github.com/tensorflow/tensorflow/tensorflow/go/op"
+ tf "github.com/tensorflow/tensorflow/tensorflow/go"
)
func Example() {
diff --git a/tensorflow/go/genop/internal/genop.go b/tensorflow/go/genop/internal/genop.go
index d17c1ca41d..dec08dee1c 100644
--- a/tensorflow/go/genop/internal/genop.go
+++ b/tensorflow/go/genop/internal/genop.go
@@ -1,16 +1,18 @@
-// Copyright 2016 The TensorFlow Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-// http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
+/*
+Copyright 2016 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+*/
// Package internal generates Go source code with functions for TensorFlow operations.
//
@@ -156,12 +158,12 @@ func makeOutputList(op *tf.Operation, start int, output string) ([]tf.Output, in
`))
tmplOp = template.Must(template.New("op").Funcs(template.FuncMap{
- "MakeComment": makeComment,
- "GoType": goType,
- "CamelCase": camelCase,
- "Identifier": identifier,
- "IsListArg": isListArg,
- "IsListAttr": isListAttr,
+ "MakeComment": makeComment,
+ "GoType": goType,
+ "CamelCase": camelCase,
+ "Identifier": identifier,
+ "IsListArg": isListArg,
+ "IsListAttr": isListAttr,
"StripLeadingColon": stripLeadingColon,
}).Parse(`
{{if .OptionalAttrs -}}
diff --git a/tensorflow/go/genop/internal/genop_test.go b/tensorflow/go/genop/internal/genop_test.go
index 00ac4827e4..c984c0063a 100644
--- a/tensorflow/go/genop/internal/genop_test.go
+++ b/tensorflow/go/genop/internal/genop_test.go
@@ -1,16 +1,18 @@
-// Copyright 2016 The TensorFlow Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-// http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
+/*
+Copyright 2016 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+*/
package internal
diff --git a/tensorflow/go/genop/internal/lib.go b/tensorflow/go/genop/internal/lib.go
index ed902f8b4d..71e8c1c93f 100644
--- a/tensorflow/go/genop/internal/lib.go
+++ b/tensorflow/go/genop/internal/lib.go
@@ -1,17 +1,19 @@
-// Copyright 2016 The TensorFlow Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-// http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
+/*
+Copyright 2016 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+*/
package internal
// #cgo LDFLAGS: -ltensorflow
diff --git a/tensorflow/go/genop/main.go b/tensorflow/go/genop/main.go
index 46163ef0ad..b6f8e2d5a8 100644
--- a/tensorflow/go/genop/main.go
+++ b/tensorflow/go/genop/main.go
@@ -1,16 +1,18 @@
-// Copyright 2016 The TensorFlow Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-// http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
+/*
+Copyright 2016 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+*/
//go:generate sh generate.sh
diff --git a/tensorflow/go/graph.go b/tensorflow/go/graph.go
index c64ba84432..e65619e80b 100644
--- a/tensorflow/go/graph.go
+++ b/tensorflow/go/graph.go
@@ -1,16 +1,18 @@
-// Copyright 2016 The TensorFlow Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-// http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
+/*
+Copyright 2016 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+*/
package tensorflow
diff --git a/tensorflow/go/graph_test.go b/tensorflow/go/graph_test.go
index 43f80ff4eb..c3120bc720 100644
--- a/tensorflow/go/graph_test.go
+++ b/tensorflow/go/graph_test.go
@@ -1,16 +1,18 @@
-// Copyright 2016 The TensorFlow Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-// http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
+/*
+Copyright 2016 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+*/
package tensorflow
diff --git a/tensorflow/go/lib.go b/tensorflow/go/lib.go
index 7f96c7809a..551cfa0b01 100644
--- a/tensorflow/go/lib.go
+++ b/tensorflow/go/lib.go
@@ -1,16 +1,18 @@
-// Copyright 2016 The TensorFlow Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-// http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
+/*
+Copyright 2016 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+*/
package tensorflow
diff --git a/tensorflow/go/op/generate.go b/tensorflow/go/op/generate.go
index ed35964969..17ece1c7a2 100644
--- a/tensorflow/go/op/generate.go
+++ b/tensorflow/go/op/generate.go
@@ -1,16 +1,18 @@
-// Copyright 2016 The TensorFlow Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-// http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
+/*
+Copyright 2016 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+*/
//go:generate go generate ../genop
//go:generate go run ../genop/main.go -outfile wrappers.go
diff --git a/tensorflow/go/op/op.go b/tensorflow/go/op/op.go
index 29c5998724..1c20bd441a 100644
--- a/tensorflow/go/op/op.go
+++ b/tensorflow/go/op/op.go
@@ -1,16 +1,18 @@
-// Copyright 2016 The TensorFlow Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-// http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
+/*
+Copyright 2016 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+*/
// Package op defines functions for adding TensorFlow operations to a Graph.
//
diff --git a/tensorflow/go/op/op_test.go b/tensorflow/go/op/op_test.go
index eaa27bfcd0..65877dca96 100644
--- a/tensorflow/go/op/op_test.go
+++ b/tensorflow/go/op/op_test.go
@@ -1,16 +1,18 @@
-// Copyright 2016 The TensorFlow Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-// http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
+/*
+Copyright 2016 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+*/
// Tests for the generated code of some operations.
diff --git a/tensorflow/go/op/scope.go b/tensorflow/go/op/scope.go
index c9fc432cd2..d87833f451 100644
--- a/tensorflow/go/op/scope.go
+++ b/tensorflow/go/op/scope.go
@@ -1,16 +1,18 @@
-// Copyright 2016 The TensorFlow Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-// http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
+/*
+Copyright 2016 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+*/
package op
diff --git a/tensorflow/go/op/scope_test.go b/tensorflow/go/op/scope_test.go
index 0c3825c178..b74fd24b26 100644
--- a/tensorflow/go/op/scope_test.go
+++ b/tensorflow/go/op/scope_test.go
@@ -1,16 +1,18 @@
-// Copyright 2016 The TensorFlow Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-// http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
+/*
+Copyright 2016 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+*/
package op
diff --git a/tensorflow/go/operation.go b/tensorflow/go/operation.go
index 9c035e5e18..e8f67c4f73 100644
--- a/tensorflow/go/operation.go
+++ b/tensorflow/go/operation.go
@@ -1,16 +1,18 @@
-// Copyright 2016 The TensorFlow Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-// http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
+/*
+Copyright 2016 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+*/
package tensorflow
diff --git a/tensorflow/go/operation_test.go b/tensorflow/go/operation_test.go
index a5e36f6683..7cba043af2 100644
--- a/tensorflow/go/operation_test.go
+++ b/tensorflow/go/operation_test.go
@@ -1,16 +1,18 @@
-// Copyright 2016 The TensorFlow Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-// http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
+/*
+Copyright 2016 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+*/
package tensorflow
diff --git a/tensorflow/go/saved_model.go b/tensorflow/go/saved_model.go
index 32e40d9a95..7aeaaec942 100644
--- a/tensorflow/go/saved_model.go
+++ b/tensorflow/go/saved_model.go
@@ -1,16 +1,18 @@
-// Copyright 2017 The TensorFlow Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-// http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
+/*
+Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+*/
package tensorflow
diff --git a/tensorflow/go/saved_model_test.go b/tensorflow/go/saved_model_test.go
index bc4d8e1b90..5f6f70c3ef 100644
--- a/tensorflow/go/saved_model_test.go
+++ b/tensorflow/go/saved_model_test.go
@@ -1,16 +1,18 @@
-// Copyright 2017 The TensorFlow Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-// http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
+/*
+Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+*/
package tensorflow
diff --git a/tensorflow/go/session.cpp b/tensorflow/go/session.cpp
index 9f6fd1f341..efa225505b 100644
--- a/tensorflow/go/session.cpp
+++ b/tensorflow/go/session.cpp
@@ -1,16 +1,18 @@
-// Copyright 2017 The TensorFlow Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-// http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
+/*
+Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+*/
// TODO(ashankar): Remove this file when TensorFlow 1.1 is released.
// See lib.go for details.
diff --git a/tensorflow/go/session.go b/tensorflow/go/session.go
index 5a6e1e37ad..3add412dcd 100644
--- a/tensorflow/go/session.go
+++ b/tensorflow/go/session.go
@@ -1,16 +1,18 @@
-// Copyright 2016 The TensorFlow Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-// http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
+/*
+Copyright 2016 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+*/
package tensorflow
diff --git a/tensorflow/go/session_test.go b/tensorflow/go/session_test.go
index 4c1b862e1f..73d78a8e57 100644
--- a/tensorflow/go/session_test.go
+++ b/tensorflow/go/session_test.go
@@ -1,16 +1,18 @@
-// Copyright 2016 The TensorFlow Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-// http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
+/*
+Copyright 2016 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+*/
package tensorflow
diff --git a/tensorflow/go/shape.go b/tensorflow/go/shape.go
index c48bbf29a3..114ab5decb 100644
--- a/tensorflow/go/shape.go
+++ b/tensorflow/go/shape.go
@@ -1,16 +1,18 @@
-// Copyright 2016 The TensorFlow Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-// http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
+/*
+Copyright 2016 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+*/
package tensorflow
diff --git a/tensorflow/go/shape_test.go b/tensorflow/go/shape_test.go
index f8f3d4e94b..94ffd27162 100644
--- a/tensorflow/go/shape_test.go
+++ b/tensorflow/go/shape_test.go
@@ -1,16 +1,18 @@
-// Copyright 2016 The TensorFlow Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-// http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
+/*
+Copyright 2016 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+*/
package tensorflow
diff --git a/tensorflow/go/status.go b/tensorflow/go/status.go
index a1f7ed5481..b4df83665a 100644
--- a/tensorflow/go/status.go
+++ b/tensorflow/go/status.go
@@ -1,16 +1,18 @@
-// Copyright 2016 The TensorFlow Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-// http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
+/*
+Copyright 2016 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+*/
package tensorflow
diff --git a/tensorflow/go/tensor.go b/tensorflow/go/tensor.go
index f96e796e5e..34e797a2b3 100644
--- a/tensorflow/go/tensor.go
+++ b/tensorflow/go/tensor.go
@@ -1,16 +1,18 @@
-// Copyright 2016 The TensorFlow Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-// http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
+/*
+Copyright 2016 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+*/
package tensorflow
diff --git a/tensorflow/go/tensor_test.go b/tensorflow/go/tensor_test.go
index 9a87923830..2fc7553f87 100644
--- a/tensorflow/go/tensor_test.go
+++ b/tensorflow/go/tensor_test.go
@@ -1,16 +1,18 @@
-// Copyright 2016 The TensorFlow Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-// http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
+/*
+Copyright 2016 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+*/
package tensorflow
diff --git a/tensorflow/go/util_test.go b/tensorflow/go/util_test.go
index 492c3b1e8b..2bec954246 100644
--- a/tensorflow/go/util_test.go
+++ b/tensorflow/go/util_test.go
@@ -1,16 +1,18 @@
-// Copyright 2016 The TensorFlow Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-// http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
+/*
+Copyright 2016 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+*/
package tensorflow
diff --git a/tensorflow/go/version.go b/tensorflow/go/version.go
index c777c44bea..7de909d036 100644
--- a/tensorflow/go/version.go
+++ b/tensorflow/go/version.go
@@ -1,16 +1,18 @@
-// Copyright 2016 The TensorFlow Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-// http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
+/*
+Copyright 2016 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+*/
package tensorflow
diff --git a/tensorflow/java/maven/README.md b/tensorflow/java/maven/README.md
index 2eb29a200b..19a214f42d 100644
--- a/tensorflow/java/maven/README.md
+++ b/tensorflow/java/maven/README.md
@@ -68,28 +68,28 @@ conducted in a [Docker](https://www.docker.com) container.
SONATYPE_PASSWORD="your_sonatype.org_password_here"
GPG_PASSPHRASE="your_gpg_passphrase_here"
cat >/tmp/settings.xml <<EOF
-<settings>
- <servers>
- <server>
- <id>ossrh</id>
- <username>${SONATYPE_USERNAME}</username>
- <password>${SONATYPE_PASSWORD}</password>
- </server>
- </servers>
- <profiles>
- <profile>
- <id>ossrh</id>
- <activation>
- <activeByDefault>true</activeByDefault>
- </activation>
- <properties>
- <gpg.executable>gpg2</gpg.executable>
- <gpg.passphrase>${GPG_PASSPHRASE}</gpg.passphrase>
- </properties>
- </profile>
- </profiles>
-</settings>
-EOF
+ <settings>
+ <servers>
+ <server>
+ <id>ossrh</id>
+ <username>${SONATYPE_USERNAME}</username>
+ <password>${SONATYPE_PASSWORD}</password>
+ </server>
+ </servers>
+ <profiles>
+ <profile>
+ <id>ossrh</id>
+ <activation>
+ <activeByDefault>true</activeByDefault>
+ </activation>
+ <properties>
+ <gpg.executable>gpg2</gpg.executable>
+ <gpg.passphrase>${GPG_PASSPHRASE}</gpg.passphrase>
+ </properties>
+ </profile>
+ </profiles>
+ </settings>
+EOF
```
2. Run the `release.sh` script.
diff --git a/tensorflow/python/estimator/estimator.py b/tensorflow/python/estimator/estimator.py
index 9891ae4eaf..36918af552 100644
--- a/tensorflow/python/estimator/estimator.py
+++ b/tensorflow/python/estimator/estimator.py
@@ -266,7 +266,11 @@ class Estimator(object):
checkpoint_path=checkpoint_path,
name=name)
- def predict(self, input_fn, predict_keys=None, hooks=None):
+ def predict(self,
+ input_fn,
+ predict_keys=None,
+ hooks=None,
+ checkpoint_path=None):
"""Returns predictions for given features.
Args:
@@ -281,6 +285,8 @@ class Estimator(object):
`None`, returns all.
hooks: List of `SessionRunHook` subclass instances. Used for callbacks
inside the prediction call.
+ checkpoint_path: Path of a specific checkpoint to predict. If `None`, the
+ latest checkpoint in `model_dir` is used.
Yields:
Evaluated values of `predictions` tensors.
@@ -294,7 +300,8 @@ class Estimator(object):
"""
hooks = _check_hooks_type(hooks)
# Check that model has been trained.
- checkpoint_path = saver.latest_checkpoint(self._model_dir)
+ if not checkpoint_path:
+ checkpoint_path = saver.latest_checkpoint(self._model_dir)
if not checkpoint_path:
raise ValueError('Could not find trained model in model_dir: {}.'.format(
self._model_dir))
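
With this change, `predict` can restore from any saved checkpoint rather than only the latest one in `model_dir`. A minimal sketch of the new argument, mirroring the `test_predict_from_checkpoint_path` test added below; the toy model and input functions are illustrative, and the TF 1.x `tf.estimator`/`tf.train` endpoints are assumed:

```python
import tensorflow as tf

def _model_fn(features, labels, mode):
  del features, labels  # unused by this toy model
  v = tf.Variable([[16.]], name='weight')
  step = tf.train.get_or_create_global_step()
  return tf.estimator.EstimatorSpec(
      mode,
      predictions=v * 2,
      loss=tf.constant(0.),
      train_op=tf.assign_add(step, 1))

def _input_fn():
  return {'x': tf.constant([[1.]])}, tf.constant([[1.]])

est1 = tf.estimator.Estimator(model_fn=_model_fn)
est1.train(_input_fn, steps=1)

# A fresh estimator can now predict from an explicit checkpoint instead of
# the latest checkpoint in its own (empty) model_dir.
est2 = tf.estimator.Estimator(model_fn=_model_fn)
values = est2.predict(
    _input_fn,
    checkpoint_path=tf.train.latest_checkpoint(est1.model_dir))
print(next(values))  # [32.]
```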
diff --git a/tensorflow/python/estimator/estimator_test.py b/tensorflow/python/estimator/estimator_test.py
index 398ff20b6b..a1659156a6 100644
--- a/tensorflow/python/estimator/estimator_test.py
+++ b/tensorflow/python/estimator/estimator_test.py
@@ -618,12 +618,20 @@ class EstimatorEvaluateTest(test.TestCase):
class EstimatorPredictTest(test.TestCase):
- def test_no_trained_model(self):
+ def test_no_trained_model_in_model_dir(self):
est = estimator.Estimator(model_fn=model_fn_global_step_incrementer)
with self.assertRaisesRegexp(ValueError,
'Could not find trained model in model_dir'):
next(est.predict(dummy_input_fn))
+ def test_no_trained_model_invalid_checkpoint_path(self):
+ est = estimator.Estimator(model_fn=model_fn_global_step_incrementer)
+ with self.assertRaises(ValueError):
+ next(
+ est.predict(
+ dummy_input_fn,
+ checkpoint_path=saver.latest_checkpoint('fakedir')))
+
def test_tensor_predictions(self):
def _model_fn(features, labels, mode):
@@ -828,6 +836,28 @@ class EstimatorPredictTest(test.TestCase):
est2 = estimator.Estimator(model_fn=_model_fn, model_dir=est1.model_dir)
self.assertEqual([32.], next(est2.predict(dummy_input_fn)))
+ def test_predict_from_checkpoint_path(self):
+
+ def _model_fn(features, labels, mode):
+ _, _ = features, labels
+ v = variables.Variable([[16.]], name='weight')
+ prediction = v * 2
+ return model_fn_lib.EstimatorSpec(
+ mode,
+ loss=constant_op.constant(0.),
+ train_op=constant_op.constant(0.),
+ predictions=prediction)
+
+ est1 = estimator.Estimator(model_fn=_model_fn)
+ est1.train(dummy_input_fn, steps=1)
+ est2 = estimator.Estimator(model_fn=_model_fn, model_dir=est1.model_dir)
+ self.assertEqual(
+ [32.],
+ next(
+ est2.predict(
+ dummy_input_fn,
+ checkpoint_path=saver.latest_checkpoint(est1.model_dir))))
+
def test_scaffold_is_used(self):
def _model_fn_scaffold(features, labels, mode):
diff --git a/tensorflow/python/estimator/inputs/queues/feeding_functions.py b/tensorflow/python/estimator/inputs/queues/feeding_functions.py
index 9da2bce0f8..a6f5157680 100644
--- a/tensorflow/python/estimator/inputs/queues/feeding_functions.py
+++ b/tensorflow/python/estimator/inputs/queues/feeding_functions.py
@@ -20,7 +20,9 @@ from __future__ import print_function
import collections
import random
+import types as tp
import numpy as np
+import six
from tensorflow.python.estimator.inputs.queues import feeding_queue_runner as fqr
from tensorflow.python.framework import dtypes
@@ -218,6 +220,54 @@ class _PandasFeedFn(object):
return feed_dict
+class _GeneratorFeedFn(object):
+ """Creates feed dictionaries from `Generator` of `dicts` of numpy arrays."""
+
+ def __init__(self,
+ placeholders,
+ generator,
+ batch_size,
+ random_start=False,
+ seed=None,
+ num_epochs=None):
+ first_sample = next(generator())
+ if len(placeholders) != len(first_sample):
+ raise ValueError("Expected {} placeholders; got {}.".format(
+ len(first_sample), len(placeholders)))
+ self._keys = sorted(list(first_sample.keys()))
+ self._col_placeholders = placeholders
+ self._generator_function = generator
+ self._iterator = generator()
+ self._batch_size = batch_size
+ self._num_epochs = num_epochs
+ self._epoch = 0
+ random.seed(seed)
+
+ def __call__(self):
+ if self._num_epochs and self._epoch >= self._num_epochs:
+ raise errors.OutOfRangeError(None, None,
+ "Already emitted %s epochs." % self._epoch)
+ list_dict = {}
+ list_dict_size = 0
+ while list_dict_size < self._batch_size:
+ try:
+ data_row = next(self._iterator)
+ except StopIteration:
+ self._epoch += 1
+ self._iterator = self._generator_function()
+ data_row = next(self._iterator)
+ for index, key in enumerate(self._keys):
+ if key not in data_row.keys():
+ raise KeyError("key mismatch between dicts emitted by GenFun"
+ "Expected {} keys; got {}".format(
+ self._keys, data_row.keys()))
+ list_dict.setdefault(self._col_placeholders[index],
+ list()).append(data_row[key])
+ list_dict_size += 1
+ feed_dict = {key: np.asarray(item) for key, item in list(list_dict.items())}
+ return feed_dict
+
+
def _enqueue_data(data,
capacity,
shuffle=False,
@@ -235,8 +285,9 @@ def _enqueue_data(data,
numpy arrays, the first enqueued `Tensor` contains the row number.
Args:
- data: a numpy `ndarray`, `OrderedDict` of numpy arrays, or pandas
- `DataFrame` that will be read into the queue.
+    data: a numpy `ndarray`, an `OrderedDict` of numpy arrays, a pandas
+      `DataFrame`, or a generator yielding `dict`s of numpy arrays, to be
+      read into the queue.
capacity: the capacity of the queue.
shuffle: whether or not to shuffle the rows of the array.
min_after_dequeue: minimum number of elements that can remain in the queue
@@ -254,7 +305,7 @@ def _enqueue_data(data,
Raises:
TypeError: `data` is not a Pandas `DataFrame`, an `OrderedDict` of numpy
- arrays or a numpy `ndarray`.
+ arrays, a numpy `ndarray`, or a generator producing these.
"""
with ops.name_scope(name):
if isinstance(data, np.ndarray):
@@ -267,6 +318,13 @@ def _enqueue_data(data,
]
queue_shapes = [()] + [col.shape[1:] for col in data.values()]
get_feed_fn = _OrderedDictNumpyFeedFn
+ elif isinstance(data, tp.FunctionType):
+ x_first_el = six.next(data())
+ x_first_keys = sorted(x_first_el.keys())
+ x_first_values = [x_first_el[key] for key in x_first_keys]
+ types = [dtypes.as_dtype(col.dtype) for col in x_first_values]
+ queue_shapes = [col.shape for col in x_first_values]
+ get_feed_fn = _GeneratorFeedFn
elif HAS_PANDAS and isinstance(data, pd.DataFrame):
types = [
dtypes.as_dtype(dt) for dt in [data.index.dtype] + list(data.dtypes)
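
The new `tp.FunctionType` branch means `_enqueue_data` also accepts a zero-argument function that returns a generator of `dict`s of numpy values; `_GeneratorFeedFn` re-invokes it at each epoch boundary. A sketch of the callable shape it expects (the names are illustrative; the contrib `generator_input_fn` added elsewhere in this change is the intended public entry point):

```python
import numpy as np

def input_generator():
  """Each call must return a fresh generator over one full epoch."""
  def gen():
    for i in range(100):
      yield {'height': np.asarray([i], dtype=np.float32),
             'label': np.asarray([i % 2], dtype=np.int32)}
  return gen()

# _GeneratorFeedFn pulls one sample up front to learn the keys and dtypes,
# and calls input_generator() again whenever StopIteration marks an epoch,
# up to num_epochs.
first = next(input_generator())
print(sorted(first.keys()))  # ['height', 'label']
```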
diff --git a/tensorflow/python/framework/tensor_util.py b/tensorflow/python/framework/tensor_util.py
index 13262a0bd8..c2378ac4b2 100644
--- a/tensorflow/python/framework/tensor_util.py
+++ b/tensorflow/python/framework/tensor_util.py
@@ -355,6 +355,10 @@ def make_tensor_proto(values, dtype=None, shape=None, verify_shape=False):
nparray = values.astype(dtype.as_numpy_dtype)
else:
nparray = values
+ elif callable(getattr(values, "__array__", None)):
+      # Objects that implement the __array__ protocol can be converted
+      # directly to a numpy array.
+ nparray = np.asarray(values, dtype=dtype)
else:
if values is None:
raise ValueError("None values not supported.")
diff --git a/tensorflow/python/framework/tensor_util_test.py b/tensorflow/python/framework/tensor_util_test.py
index 47d3681a1f..dfefc27f99 100644
--- a/tensorflow/python/framework/tensor_util_test.py
+++ b/tensorflow/python/framework/tensor_util_test.py
@@ -23,6 +23,7 @@ import sys
from tensorflow.python.framework import constant_op
from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import ops
from tensorflow.python.framework import tensor_shape
from tensorflow.python.framework import tensor_util
from tensorflow.python.ops import array_ops
@@ -47,13 +48,13 @@ class TensorUtilTest(test.TestCase):
def testFloatN(self):
t = tensor_util.make_tensor_proto([10.0, 20.0, 30.0])
- if sys.byteorder == "big":
+ if sys.byteorder == "big":
self.assertProtoEquals("""
dtype: DT_FLOAT
tensor_shape { dim { size: 3 } }
tensor_content: "A \000\000A\240\000\000A\360\000\000"
- """, t)
- else:
+ """, t)
+ else:
self.assertProtoEquals("""
dtype: DT_FLOAT
tensor_shape { dim { size: 3 } }
@@ -65,12 +66,12 @@ class TensorUtilTest(test.TestCase):
def testFloatTyped(self):
t = tensor_util.make_tensor_proto([10.0, 20.0, 30.0], dtype=dtypes.float32)
- if sys.byteorder == "big":
+ if sys.byteorder == "big":
self.assertProtoEquals("""
dtype: DT_FLOAT
tensor_shape { dim { size: 3 } }
tensor_content: "A \000\000A\240\000\000A\360\000\000"
- """, t)
+ """, t)
else:
self.assertProtoEquals("""
dtype: DT_FLOAT
@@ -83,13 +84,13 @@ class TensorUtilTest(test.TestCase):
def testFloatTypeCoerce(self):
t = tensor_util.make_tensor_proto([10, 20, 30], dtype=dtypes.float32)
- if sys.byteorder == "big":
+ if sys.byteorder == "big":
self.assertProtoEquals("""
dtype: DT_FLOAT
tensor_shape { dim { size: 3 } }
tensor_content: "A \000\000A\240\000\000A\360\000\000"
- """, t)
- else:
+ """, t)
+ else:
self.assertProtoEquals("""
dtype: DT_FLOAT
tensor_shape { dim { size: 3 } }
@@ -102,13 +103,13 @@ class TensorUtilTest(test.TestCase):
def testFloatTypeCoerceNdarray(self):
arr = np.asarray([10, 20, 30], dtype="int")
t = tensor_util.make_tensor_proto(arr, dtype=dtypes.float32)
- if sys.byteorder == "big":
+ if sys.byteorder == "big":
self.assertProtoEquals("""
dtype: DT_FLOAT
tensor_shape { dim { size: 3 } }
tensor_content: "A \000\000A\240\000\000A\360\000\000"
- """, t)
- else:
+ """, t)
+ else:
self.assertProtoEquals("""
dtype: DT_FLOAT
tensor_shape { dim { size: 3 } }
@@ -120,13 +121,13 @@ class TensorUtilTest(test.TestCase):
def testFloatSizes(self):
t = tensor_util.make_tensor_proto([10.0, 20.0, 30.0], shape=[1, 3])
- if sys.byteorder == "big":
+ if sys.byteorder == "big":
self.assertProtoEquals("""
dtype: DT_FLOAT
tensor_shape { dim { size: 1 } dim { size: 3 } }
tensor_content: "A \000\000A\240\000\000A\360\000\000"
- """, t)
- else:
+ """, t)
+ else:
self.assertProtoEquals("""
dtype: DT_FLOAT
tensor_shape { dim { size: 1 } dim { size: 3 } }
@@ -138,13 +139,13 @@ class TensorUtilTest(test.TestCase):
def testFloatSizes2(self):
t = tensor_util.make_tensor_proto([10.0, 20.0, 30.0], shape=[3, 1])
- if sys.byteorder == "big":
+ if sys.byteorder == "big":
self.assertProtoEquals("""
dtype: DT_FLOAT
tensor_shape { dim { size: 3 } dim { size: 1 } }
tensor_content: "A \000\000A\240\000\000A\360\000\000"
- """, t)
- else:
+ """, t)
+ else:
self.assertProtoEquals("""
dtype: DT_FLOAT
tensor_shape { dim { size: 3 } dim { size: 1 } }
@@ -166,13 +167,13 @@ class TensorUtilTest(test.TestCase):
def testFloatNpArrayFloat64(self):
t = tensor_util.make_tensor_proto(
np.array([[10.0, 20.0, 30.0]], dtype=np.float64))
- if sys.byteorder == "big":
+ if sys.byteorder == "big":
self.assertProtoEquals("""
dtype: DT_DOUBLE
tensor_shape { dim { size: 1 } dim { size: 3 } }
tensor_content: "@$\000\000\000\000\000\000@4\000\000\000\000\000\000@>\000\000\000\000\000\000"
- """, t)
- else:
+ """, t)
+ else:
self.assertProtoEquals("""
dtype: DT_DOUBLE
tensor_shape { dim { size: 1 } dim { size: 3 } }
@@ -257,13 +258,13 @@ class TensorUtilTest(test.TestCase):
def testIntNDefaultType(self):
t = tensor_util.make_tensor_proto([10, 20, 30, 40], shape=[2, 2])
- if sys.byteorder == "big":
+ if sys.byteorder == "big":
self.assertProtoEquals("""
dtype: DT_INT32
tensor_shape { dim { size: 2 } dim { size: 2 } }
tensor_content: "\000\000\000\\n\000\000\000\024\000\000\000\036\000\000\000("
- """, t)
- else:
+ """, t)
+ else:
self.assertProtoEquals("""
dtype: DT_INT32
tensor_shape { dim { size: 2 } dim { size: 2 } }
@@ -327,13 +328,13 @@ class TensorUtilTest(test.TestCase):
def testLongN(self):
t = tensor_util.make_tensor_proto(
[10, 20, 30], shape=[1, 3], dtype=dtypes.int64)
- if sys.byteorder == "big":
+ if sys.byteorder == "big":
self.assertProtoEquals("""
dtype: DT_INT64
tensor_shape { dim { size: 1 } dim { size: 3 } }
tensor_content: "\000\000\000\000\000\000\000\\n\000\000\000\000\000\000\000\024\000\000\000\000\000\000\000\036"
- """, t)
- else:
+ """, t)
+ else:
self.assertProtoEquals("""
dtype: DT_INT64
tensor_shape { dim { size: 1 } dim { size: 3 } }
@@ -345,13 +346,13 @@ class TensorUtilTest(test.TestCase):
def testLongNpArray(self):
t = tensor_util.make_tensor_proto(np.array([10, 20, 30]))
- if sys.byteorder == "big":
+ if sys.byteorder == "big":
self.assertProtoEquals("""
dtype: DT_INT64
tensor_shape { dim { size: 3 } }
tensor_content: "\000\000\000\000\000\000\000\\n\000\000\000\000\000\000\000\024\000\000\000\000\000\000\000\036"
- """, t)
- else:
+ """, t)
+ else:
self.assertProtoEquals("""
dtype: DT_INT64
tensor_shape { dim { size: 3 } }
@@ -366,13 +367,13 @@ class TensorUtilTest(test.TestCase):
data = [(21,), (22,), (23,)]
t = tensor_util.make_tensor_proto(data, dtype=dtypes.qint32)
- if sys.byteorder == "big":
+ if sys.byteorder == "big":
self.assertProtoEquals("""
dtype: DT_QINT32
tensor_shape { dim { size: 3 } }
tensor_content: "\000\000\000\025\000\000\000\026\000\000\000\027"
- """, t)
- else:
+ """, t)
+ else:
self.assertProtoEquals("""
dtype: DT_QINT32
tensor_shape { dim { size: 3 } }
@@ -403,13 +404,13 @@ class TensorUtilTest(test.TestCase):
self.assertAllEqual(np.array(data, dtype=a.dtype), a)
t = tensor_util.make_tensor_proto(data, dtype=dtypes.quint16)
- if sys.byteorder == "big":
+ if sys.byteorder == "big":
self.assertProtoEquals("""
dtype: DT_QUINT16
tensor_shape { dim { size: 3 } }
tensor_content: "\000\025\000\026\000\027"
- """, t)
- else:
+ """, t)
+ else:
self.assertProtoEquals("""
dtype: DT_QUINT16
tensor_shape { dim { size: 3 } }
@@ -420,13 +421,13 @@ class TensorUtilTest(test.TestCase):
self.assertAllEqual(np.array(data, dtype=a.dtype), a)
t = tensor_util.make_tensor_proto(data, dtype=dtypes.qint16)
- if sys.byteorder == "big":
+ if sys.byteorder == "big":
self.assertProtoEquals("""
dtype: DT_QINT16
tensor_shape { dim { size: 3 } }
tensor_content: "\000\025\000\026\000\027"
- """, t)
- else:
+ """, t)
+ else:
self.assertProtoEquals("""
dtype: DT_QINT16
tensor_shape { dim { size: 3 } }
@@ -667,6 +668,23 @@ class TensorUtilTest(test.TestCase):
self.assertFalse(tensor_util.ShapeEquals(t, [1, 4]))
self.assertFalse(tensor_util.ShapeEquals(t, [4]))
+ def testMockArray(self):
+
+ class MockArray(object):
+
+ def __init__(self, array):
+ self.array = array
+
+ def __array__(self, dtype=None):
+ return np.asarray(self.array, dtype)
+
+ with self.test_session() as sess:
+ ma = MockArray(np.array([10, 20, 30]))
+ t = ops.convert_to_tensor(ma)
+ a = sess.run(t)
+      self.assertEqual(np.int64, a.dtype)
+ self.assertAllClose(np.array([10, 20, 30], dtype=np.int64), a)
+
class ConstantValueTest(test.TestCase):
diff --git a/tensorflow/python/kernel_tests/pooling_ops_3d_test.py b/tensorflow/python/kernel_tests/pooling_ops_3d_test.py
index ca38f1af9f..fa1553a3f6 100644
--- a/tensorflow/python/kernel_tests/pooling_ops_3d_test.py
+++ b/tensorflow/python/kernel_tests/pooling_ops_3d_test.py
@@ -23,6 +23,7 @@ import numpy as np
from tensorflow.python.framework import constant_op
from tensorflow.python.framework import test_util
from tensorflow.python.ops import gradient_checker
+from tensorflow.python.ops import gradients_impl
from tensorflow.python.ops import nn_ops
import tensorflow.python.ops.nn_grad # pylint: disable=unused-import
from tensorflow.python.platform import test
@@ -234,7 +235,8 @@ class PoolingTest(test.TestCase):
x = np.arange(1, total_size + 1, dtype=np.float32)
with self.test_session(use_gpu=use_gpu):
input_tensor = constant_op.constant(x, shape=input_sizes, name="input")
- err_margin = 1e-3
+ err_g_margin = 1e-3
+ err_gg_margin = 1.5e-2
if pool_func == nn_ops.avg_pool3d:
func_name = "avg_pool3d"
x_init_value = None
@@ -259,19 +261,27 @@ class PoolingTest(test.TestCase):
padding=padding,
data_format=data_format,
name=func_name)
+ t_g = gradients_impl.gradients(t**2, input_tensor)[0]
- if data_format == "NCDHW":
- t = test_util.NCHWToNHWC(t)
-
- err = gradient_checker.compute_gradient_error(
+ err_g = gradient_checker.compute_gradient_error(
input_tensor,
input_sizes,
t,
output_sizes,
x_init_value=x_init_value,
delta=1e-2)
- print("%s gradient error = " % func_name, err)
- self.assertLess(err, err_margin)
+ err_gg = gradient_checker.compute_gradient_error(
+ input_tensor,
+ input_sizes,
+ t_g,
+ input_sizes,
+ x_init_value=x_init_value,
+ delta=1e-2)
+
+ print("%s gradient error = " % func_name, err_g)
+ self.assertLess(err_g, err_g_margin)
+ print("%s second-order gradient error = " % func_name, err_gg)
+ self.assertLess(err_gg, err_gg_margin)
def _ConstructAndTestGradient(self,
pool_func,
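
The reworked test computes `t_g = d(t**2)/d(input)` and runs the gradient checker on `t_g` itself, so a single test now validates both the first-order gradient and the new grad-of-grad kernels. The same pattern in isolation, using the public API on a 2-D pool for brevity (shapes are illustrative):

```python
import numpy as np
import tensorflow as tf

with tf.Session():
  x_init = np.arange(1., 17., dtype=np.float32).reshape([1, 4, 4, 1])
  x = tf.constant(x_init)
  y = tf.nn.max_pool(x, ksize=[1, 2, 2, 1], strides=[1, 2, 2, 1],
                     padding='VALID')
  # d(y**2)/dx already contains a MaxPoolGrad op, so checking the gradient
  # of y_g exercises the newly registered MaxPoolGradGrad.
  y_g = tf.gradients(y ** 2, x)[0]
  err = tf.test.compute_gradient_error(x, [1, 4, 4, 1], y_g, [1, 4, 4, 1],
                                       x_init_value=x_init, delta=1e-2)
  print('second-order gradient error =', err)
```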
diff --git a/tensorflow/python/kernel_tests/pooling_ops_test.py b/tensorflow/python/kernel_tests/pooling_ops_test.py
index e657faa131..c3e2a640b7 100644
--- a/tensorflow/python/kernel_tests/pooling_ops_test.py
+++ b/tensorflow/python/kernel_tests/pooling_ops_test.py
@@ -27,6 +27,7 @@ from tensorflow.python.framework import test_util
from tensorflow.python.ops import array_ops
from tensorflow.python.ops import gen_nn_ops
from tensorflow.python.ops import gradient_checker
+from tensorflow.python.ops import gradients_impl
from tensorflow.python.ops import nn_ops
import tensorflow.python.ops.nn_grad # pylint: disable=unused-import
from tensorflow.python.platform import test
@@ -522,7 +523,7 @@ class PoolingTest(test.TestCase):
# The following are tests that verify that the CPU and GPU implementations
  # produce the same results.
def _CompareMaxPoolingFwd(self, input_shape, ksize, strides, padding):
- for dtype in np.float32, np.float16:
+ for dtype in np.float64, np.float32, np.float16:
tensor_input = np.random.rand(*input_shape).astype(dtype)
with self.test_session(use_gpu=True):
t = constant_op.constant(tensor_input, shape=input_shape)
@@ -536,7 +537,7 @@ class PoolingTest(test.TestCase):
def _CompareMaxPoolingBk(self, input_shape, output_shape, ksize, strides,
padding):
- for dtype in np.float32, np.float16:
+ for dtype in np.float64, np.float32, np.float16:
# Generate numbers in a narrow range, so that there are many duplicates
# in the input.
tensor_input = np.random.random_integers(0, 3, input_shape).astype(dtype)
@@ -559,12 +560,39 @@ class PoolingTest(test.TestCase):
padding)
cpu_val = out_op.eval()
self.assertShapeEqual(cpu_val, out_op)
- if dtype == np.float16:
- # The CPU version accumulates its gradient on fp16, so it's less
- # accurate than the GPU version that does the accumulation on fp32
- self.assertAllClose(cpu_val, gpu_val, rtol=0.01, atol=0.01)
- else:
- self.assertAllClose(cpu_val, gpu_val)
+ # The CPU version accumulates its gradient on fp16, so it's less
+ # accurate than the GPU version that does the accumulation on fp32
+ self.assertAllCloseAccordingToType(
+ cpu_val, gpu_val, half_rtol=0.01, half_atol=0.01)
+
+ def _CompareMaxPoolingGradBk(self, input_shape, output_shape, ksize, strides,
+ padding):
+ for dtype in np.float64, np.float32, np.float16:
+ # Generate numbers in a narrow range, so that there are many duplicates
+ # in the input.
+ tensor_input = np.random.random_integers(0, 3, input_shape).astype(dtype)
+ with self.test_session(use_gpu=True):
+ t = constant_op.constant(tensor_input, shape=input_shape)
+ _, argmax_op = nn_ops.max_pool_with_argmax(t, ksize, strides, padding)
+ argmax = argmax_op.eval()
+ grad_in = constant_op.constant(tensor_input, shape=input_shape)
+ out_op = gen_nn_ops._max_pool_grad_grad_with_argmax(
+ t, grad_in, argmax, ksize, strides, padding)
+ gpu_val = out_op.eval()
+ self.assertShapeEqual(gpu_val, out_op)
+ with self.test_session(use_gpu=False):
+ t = constant_op.constant(tensor_input, shape=input_shape)
+ out_op = nn_ops.max_pool(t, ksize, strides, padding)
+ orig_out = out_op.eval()
+ grad_in = constant_op.constant(tensor_input, shape=input_shape)
+ out_op = gen_nn_ops._max_pool_grad_grad(t, orig_out, grad_in, ksize,
+ strides, padding)
+ cpu_val = out_op.eval()
+ self.assertShapeEqual(cpu_val, out_op)
+ # The CPU version accumulates its gradient on fp16, so it's less
+ # accurate than the GPU version that does the accumulation on fp32
+ self.assertAllCloseAccordingToType(
+ cpu_val, gpu_val, half_rtol=0.01, half_atol=0.01)
def testMaxPoolingWithArgmax(self):
# MaxPoolWithArgMax is implemented only on CUDA.
@@ -608,6 +636,28 @@ class PoolingTest(test.TestCase):
self.assertAllClose(out,
[11.0, 12.0, 0.0, 13.0, 0.0, 14.0, 0.0, 0.0, 0.0])
+ def testMaxPoolingGradGradWithArgmax(self):
+ # MaxPoolWithArgMax is implemented only on CUDA.
+ if not test.is_gpu_available(cuda_only=True):
+ return
+ orig_input = [1.0, 1.0, 1.0, 1.0, 0.0, 1.0, 1.0, 1.0, 1.0]
+ tensor_input = [11.0, 12.0, 13.0, 14.0, 15.0, 16.0, 17.0, 18.0, 19.0]
+ tensor_argmax = list(np.array([0, 1, 3, 5], dtype=np.int64))
+ with self.test_session(use_gpu=True):
+ orig_in = constant_op.constant(orig_input, shape=[1, 3, 3, 1])
+ t = constant_op.constant(tensor_input, shape=[1, 3, 3, 1])
+ argmax = constant_op.constant(
+ tensor_argmax, shape=[1, 2, 2, 1], dtype=dtypes.int64)
+ out_op = gen_nn_ops._max_pool_grad_grad_with_argmax(
+ orig_in,
+ t,
+ argmax,
+ ksize=[1, 2, 2, 1],
+ strides=[1, 1, 1, 1],
+ padding="VALID")
+ out = out_op.eval().flatten()
+ self.assertAllClose(out, [11.0, 12.0, 14.0, 16.0])
+
def _ConstructAndTestGradient(self,
pool_func,
input_sizes,
@@ -648,14 +698,14 @@ class PoolingTest(test.TestCase):
input_tensor = constant_op.constant(x, shape=input_sizes, name="input")
if pool_func == nn_ops.avg_pool:
func_name = "avg_pool"
- err_margin = 1e-4
+ err_tolerance = 1e-4
else:
if x_init_value is None:
x_init_value = np.asfarray(
np.arange(1, total_size + 1),
dtype=np.float32).reshape(input_sizes)
func_name = "max_pool"
- err_margin = 1e-3
+ err_tolerance = 1e-3
if data_format == "NCHW":
ksize = [1, 1, window_rows, window_rows]
strides = [1, 1, row_stride, col_stride]
@@ -682,7 +732,84 @@ class PoolingTest(test.TestCase):
x_init_value=x_init_value,
delta=1e-2)
print("%s gradient error = " % func_name, err)
- self.assertLess(err, err_margin)
+ self.assertLess(err, err_tolerance)
+
+ def _ConstructAndTestSecondGradient(self,
+ pool_func,
+ input_sizes,
+ output_sizes,
+ window_rows,
+ window_cols,
+ row_stride,
+ col_stride,
+ padding,
+ data_format,
+ use_gpu,
+ x_init_value=None):
+ """Verifies the second-order gradients of the pooling function.
+
+ Args:
+      pool_func: Function to be called, e.g. nn_ops.max_pool or
+        nn_ops.avg_pool.
+ input_sizes: Input tensor dimensions.
+ output_sizes: Output tensor dimensions.
+      window_rows: kernel size in the row dimension.
+      window_cols: kernel size in the col dimension.
+      row_stride: row stride.
+      col_stride: col stride.
+ padding: Padding type.
+ data_format: Data format.
+ use_gpu: whether we are running on GPU
+ x_init_value: Values to be passed to the gradient checker.
+ """
+ assert input_sizes[0] == output_sizes[0]
+ assert input_sizes[3] == output_sizes[3]
+ total_size = 1
+ for s in input_sizes:
+ total_size *= s
+    # Initializes the input tensor with an array of numbers incrementing
+    # from 1.
+ x = [f * 1.0 for f in range(1, total_size + 1)]
+ with self.test_session(use_gpu=use_gpu):
+ input_tensor = constant_op.constant(x, shape=input_sizes, name="input")
+ if pool_func == nn_ops.avg_pool:
+ func_name = "avg_pool"
+ err_tolerance = 1e-3
+ else:
+ if x_init_value is None:
+ x_init_value = np.asfarray(
+ np.arange(1, total_size + 1),
+ dtype=np.float32).reshape(input_sizes)
+ func_name = "max_pool"
+ err_tolerance = 1e-2
+ if data_format == "NCHW":
+ ksize = [1, 1, window_rows, window_rows]
+ strides = [1, 1, row_stride, col_stride]
+ t = test_util.NHWCToNCHW(input_tensor)
+ else:
+ ksize = [1, window_rows, window_rows, 1]
+ strides = [1, row_stride, col_stride, 1]
+ t = input_tensor
+ t = pool_func(
+ t,
+ ksize=ksize,
+ strides=strides,
+ padding=padding,
+ data_format=data_format,
+ name=func_name)
+ if data_format == "NCHW":
+ t = test_util.NHWCToNCHW(t)
+
+ t_g = gradients_impl.gradients(t**2, input_tensor)[0]
+ err = gradient_checker.compute_gradient_error(
+ input_tensor,
+ input_sizes,
+ t_g,
+ input_sizes,
+ x_init_value=x_init_value,
+ delta=1e-2)
+ print("%s second-order gradient error = " % func_name, err)
+ self.assertLess(err, err_tolerance)
def _testMaxPoolGradValidPadding1_1(self, data_format, use_gpu):
self._ConstructAndTestGradient(
@@ -1051,6 +1178,144 @@ class PoolingTest(test.TestCase):
self._testMaxPoolGradDirectWithNans2_1()
self._testMaxPoolGradDirectWithNans2_2()
+ def _testMaxPoolGradGradValidPadding1_1(self, data_format, use_gpu):
+ self._ConstructAndTestSecondGradient(
+ nn_ops.max_pool,
+ input_sizes=[1, 3, 3, 1],
+ output_sizes=[1, 3, 3, 1],
+ window_rows=1,
+ window_cols=1,
+ row_stride=1,
+ col_stride=1,
+ padding="VALID",
+ data_format=data_format,
+ use_gpu=use_gpu)
+
+ def _testMaxPoolGradGradValidPadding2_1_6(self, data_format, use_gpu):
+ self._ConstructAndTestSecondGradient(
+ nn_ops.max_pool,
+ input_sizes=[2, 6, 6, 3],
+ output_sizes=[2, 5, 5, 3],
+ window_rows=2,
+ window_cols=2,
+ row_stride=1,
+ col_stride=1,
+ padding="VALID",
+ data_format=data_format,
+ use_gpu=use_gpu)
+
+ def _testMaxPoolGradGradValidPadding2_1_7(self, data_format, use_gpu):
+ self._ConstructAndTestSecondGradient(
+ nn_ops.max_pool,
+ input_sizes=[2, 7, 7, 3],
+ output_sizes=[2, 6, 6, 3],
+ window_rows=2,
+ window_cols=2,
+ row_stride=1,
+ col_stride=1,
+ padding="VALID",
+ data_format=data_format,
+ use_gpu=use_gpu)
+
+ def _testMaxPoolGradGradValidPadding2_2(self, data_format, use_gpu):
+ self._ConstructAndTestSecondGradient(
+ nn_ops.max_pool,
+ input_sizes=[2, 2, 2, 3],
+ output_sizes=[2, 1, 1, 3],
+ window_rows=2,
+ window_cols=2,
+ row_stride=2,
+ col_stride=2,
+ padding="VALID",
+ data_format=data_format,
+ use_gpu=use_gpu)
+
+ def _testMaxPoolGradGradSamePadding1_1(self, data_format, use_gpu):
+ self._ConstructAndTestSecondGradient(
+ nn_ops.max_pool,
+ input_sizes=[2, 2, 4, 3],
+ output_sizes=[2, 2, 4, 3],
+ window_rows=1,
+ window_cols=1,
+ row_stride=1,
+ col_stride=1,
+ padding="SAME",
+ data_format=data_format,
+ use_gpu=use_gpu)
+
+ def _testMaxPoolGradGradSamePadding2_1(self, data_format, use_gpu):
+ self._ConstructAndTestSecondGradient(
+ nn_ops.max_pool,
+ input_sizes=[2, 2, 4, 3],
+ output_sizes=[2, 2, 4, 3],
+ window_rows=2,
+ window_cols=2,
+ row_stride=1,
+ col_stride=1,
+ padding="SAME",
+ data_format=data_format,
+ use_gpu=use_gpu)
+
+ def _testMaxPoolGradGradSamePadding2_2(self, data_format, use_gpu):
+ self._ConstructAndTestSecondGradient(
+ nn_ops.max_pool,
+ input_sizes=[2, 2, 4, 3],
+ output_sizes=[2, 1, 2, 3],
+ window_rows=2,
+ window_cols=2,
+ row_stride=2,
+ col_stride=2,
+ padding="SAME",
+ data_format=data_format,
+ use_gpu=use_gpu)
+
+ def _testMaxPoolGradGradSamePadding3_1(self, data_format, use_gpu):
+ self._ConstructAndTestSecondGradient(
+ nn_ops.max_pool,
+ input_sizes=[1, 7, 7, 1],
+ output_sizes=[1, 7, 7, 1],
+ window_rows=3,
+ window_cols=3,
+ row_stride=1,
+ col_stride=1,
+ padding="SAME",
+ data_format=data_format,
+ use_gpu=use_gpu)
+
+ def testMaxPoolGradGrad(self):
+ for (data_format, use_gpu) in GetTestConfigs():
+ self._testMaxPoolGradGradValidPadding1_1(data_format, use_gpu)
+ self._testMaxPoolGradGradValidPadding2_1_6(data_format, use_gpu)
+ self._testMaxPoolGradGradValidPadding2_1_7(data_format, use_gpu)
+ self._testMaxPoolGradGradValidPadding2_2(data_format, use_gpu)
+ self._testMaxPoolGradGradSamePadding1_1(data_format, use_gpu)
+ self._testMaxPoolGradGradSamePadding2_1(data_format, use_gpu)
+ self._testMaxPoolGradGradSamePadding2_2(data_format, use_gpu)
+ self._testMaxPoolGradGradSamePadding3_1(data_format, use_gpu)
+
+ def _MaxPoolGradGrad(self, orig_input, orig_output, grad, window_rows,
+ window_cols, row_stride, col_stride, padding):
+ """Max Pooling Second-Order Gradient.
+
+ Args:
+ orig_input: A float Tensor. The original input tensor.
+ orig_output: A float Tensor. The original output tensor.
+ grad: A float Tensor.
+ The 4D (batch x out_rows x out_cols x depth) output backprop.
+ window_rows: integer. Kernel size along rows dimension.
+ window_cols: integer. Kernel size along cols dimension.
+      row_stride: integer. Stride along rows dimension.
+      col_stride: integer. Stride along cols dimension.
+ padding: PoolingOpDef.Padding. Padding type.
+
+ Returns:
+ A Tensor.
+ """
+    return gen_nn_ops._max_pool_grad_grad(
+        orig_input, orig_output, grad, [1, window_rows, window_cols, 1],
+        [1, row_stride, col_stride, 1], padding)
+
def testAvgPoolGrad(self):
for (data_format, use_gpu) in GetTestConfigs():
self._testAvgPoolGradValidPadding1_1(data_format, use_gpu)
@@ -1239,6 +1504,19 @@ def GetMaxPoolGradTest(input_size, filter_size, output_size, strides, padding):
return Test
+def GetMaxPoolGradGradTest(input_size, filter_size, output_size, strides,
+ padding):
+
+ def Test(self):
+ # MaxPoolWithArgMax is implemented only on CUDA.
+ if not test.is_gpu_available(cuda_only=True):
+ return
+ self._CompareMaxPoolingGradBk(input_size, output_size, filter_size, strides,
+ padding)
+
+ return Test
+
+
if __name__ == "__main__":
for (name_, input_size_, filter_size_, output_size_, stride_,
padding_) in GetShrunkInceptionMaxPoolShapes():
@@ -1247,4 +1525,7 @@ if __name__ == "__main__":
setattr(PoolingTest, "testMaxPoolGrad_" + name_,
GetMaxPoolGradTest(input_size_, filter_size_, output_size_, stride_,
padding_))
+ setattr(PoolingTest, "testMaxPoolGradGrad_" + name_,
+ GetMaxPoolGradGradTest(input_size_, filter_size_, output_size_,
+ stride_, padding_))
test.main()
diff --git a/tensorflow/python/layers/normalization.py b/tensorflow/python/layers/normalization.py
index 19f6a98547..8659382834 100644
--- a/tensorflow/python/layers/normalization.py
+++ b/tensorflow/python/layers/normalization.py
@@ -162,7 +162,7 @@ class BatchNormalization(base._Layer): # pylint: disable=protected-access
broadcast_shape[self.axis] = input_shape[self.axis].value
# Determines whether broadcasting is needed.
- needs_broadcasting = (sorted(reduction_axes) != range(ndim)[:-1])
+ needs_broadcasting = (sorted(reduction_axes) != list(range(ndim))[:-1])
# Determine a boolean value for `training`: could be True, False, or None.
training_value = utils.constant_value(training)
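
The `list(range(ndim))` fix is a Python 3 compatibility repair: there `range` returns a lazy sequence, and a `list` never compares equal to a `range`, so `needs_broadcasting` was always `True` under Python 3 even when the axes matched. In miniature:

```python
ndim = 4
reduction_axes = [0, 1, 2]

# Python 2: range(4)[:-1] == [0, 1, 2], so the comparison behaved.
# Python 3: range(4)[:-1] == range(0, 3), and list != range always holds.
print(sorted(reduction_axes) != range(ndim)[:-1])        # True on Python 3
print(sorted(reduction_axes) != list(range(ndim))[:-1])  # False, as intended
```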
diff --git a/tensorflow/python/ops/hidden_ops.txt b/tensorflow/python/ops/hidden_ops.txt
index fbfd7bb7d6..4981cb6a2e 100644
--- a/tensorflow/python/ops/hidden_ops.txt
+++ b/tensorflow/python/ops/hidden_ops.txt
@@ -252,6 +252,7 @@ TruncateMod
# nn_ops
AvgPoolGrad # "*Grad" accessible through nn_grad instead of nn_ops.
+AvgPool3DGrad
BatchNormWithGlobalNormalization
BatchNormWithGlobalNormalizationGrad
FusedBatchNorm
@@ -260,6 +261,10 @@ SparseSoftmaxCrossEntropyWithLogits
LRNGrad
MaxPoolGrad
MaxPoolGradWithArgmax
+MaxPoolGradGrad
+MaxPoolGradGradWithArgmax
+MaxPool3DGrad
+MaxPool3DGradGrad
ReluGrad
Relu6Grad
EluGrad
diff --git a/tensorflow/python/ops/linalg_ops.py b/tensorflow/python/ops/linalg_ops.py
index d6e7f5f58f..e2fd25675e 100644
--- a/tensorflow/python/ops/linalg_ops.py
+++ b/tensorflow/python/ops/linalg_ops.py
@@ -240,7 +240,7 @@ def svd(tensor, full_matrices=False, compute_uv=True, name=None):
# a is a tensor.
# s is a tensor of singular values.
# u is a tensor of left singular vectors.
- #v is a tensor of right singular vectors.
+ # v is a tensor of right singular vectors.
s, u, v = svd(a)
s = svd(a, compute_uv=False)
```
@@ -258,10 +258,10 @@ def svd(tensor, full_matrices=False, compute_uv=True, name=None):
Returns:
s: Singular values. Shape is `[..., P]`.
- u: Right singular vectors. If `full_matrices` is `False` (default) then
+ u: Left singular vectors. If `full_matrices` is `False` (default) then
shape is `[..., M, P]`; if `full_matrices` is `True` then shape is
`[..., M, M]`. Not returned if `compute_uv` is `False`.
- v: Left singular vectors. If `full_matrices` is `False` (default) then
+ v: Right singular vectors. If `full_matrices` is `False` (default) then
shape is `[..., N, P]`. If `full_matrices` is `True` then shape is
`[..., N, N]`. Not returned if `compute_uv` is `False`.
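
With the docstring corrected (`u` holds the left singular vectors, `v` the right), a quick shape check makes the convention concrete; note that unlike `numpy.linalg.svd`, `v` is returned un-transposed. A sketch with illustrative shapes:

```python
import numpy as np
import tensorflow as tf

a = tf.constant(np.random.rand(5, 3))  # M=5, N=3, so P = min(M, N) = 3
s, u, v = tf.svd(a)                    # full_matrices=False by default
with tf.Session() as sess:
  a_val, s_val, u_val, v_val = sess.run([a, s, u, v])
print(s_val.shape, u_val.shape, v_val.shape)  # (3,) (5, 3) (3, 3)
# Reconstruction uses v transposed: a ~= u * diag(s) * v^T.
print(np.allclose(u_val.dot(np.diag(s_val)).dot(v_val.T), a_val))
```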
diff --git a/tensorflow/python/ops/nn_grad.py b/tensorflow/python/ops/nn_grad.py
index a01466e1ae..f5e9550b97 100644
--- a/tensorflow/python/ops/nn_grad.py
+++ b/tensorflow/python/ops/nn_grad.py
@@ -121,7 +121,7 @@ def _Conv3DBackpropFilterGrad(op, grad):
@ops.RegisterGradient("AvgPool3D")
def _AvgPool3DGrad(op, grad):
- return nn_ops.avg_pool3d_grad(
+ return gen_nn_ops._avg_pool3d_grad(
array_ops.shape(op.inputs[0]),
grad,
ksize=op.get_attr("ksize"),
@@ -130,15 +130,58 @@ def _AvgPool3DGrad(op, grad):
data_format=op.get_attr("data_format"))
+@ops.RegisterGradient("AvgPool3DGrad")
+def _AvgPool3DGradGrad(op, grad):
+ return (array_ops.stop_gradient(op.inputs[0]), gen_nn_ops.avg_pool3d(
+ grad,
+ op.get_attr("ksize"),
+ op.get_attr("strides"),
+ op.get_attr("padding"),
+ data_format=op.get_attr("data_format")))
+
+
@ops.RegisterGradient("MaxPool3D")
def _MaxPool3DGrad(op, grad):
- return nn_ops.max_pool3d_grad(op.inputs[0],
- op.outputs[0],
- grad,
- ksize=op.get_attr("ksize"),
- strides=op.get_attr("strides"),
- padding=op.get_attr("padding"),
- data_format=op.get_attr("data_format"))
+ return gen_nn_ops._max_pool3d_grad(
+ op.inputs[0],
+ op.outputs[0],
+ grad,
+ ksize=op.get_attr("ksize"),
+ strides=op.get_attr("strides"),
+ padding=op.get_attr("padding"),
+ data_format=op.get_attr("data_format"))
+
+
+@ops.RegisterGradient("MaxPool3DGrad")
+def _MaxPool3DGradGrad(op, grad):
+ return (array_ops.zeros(
+ shape=array_ops.shape(op.inputs[0]),
+ dtype=op.inputs[0].dtype), array_ops.zeros(
+ shape=array_ops.shape(op.inputs[1]), dtype=op.inputs[1].dtype),
+ gen_nn_ops._max_pool3d_grad_grad(
+ op.inputs[0],
+ op.inputs[1],
+ grad,
+ op.get_attr("ksize"),
+ op.get_attr("strides"),
+ padding=op.get_attr("padding"),
+ data_format=op.get_attr("data_format")))
+
+
+@ops.RegisterGradient("MaxPool3DGradGrad")
+def _MaxPool3DGradGradGrad(op, grad):
+ return (array_ops.zeros(
+ shape=array_ops.shape(op.inputs[0]),
+ dtype=op.inputs[0].dtype), array_ops.zeros(
+ shape=array_ops.shape(op.inputs[1]), dtype=op.inputs[1].dtype),
+ gen_nn_ops._max_pool3d_grad(
+ op.inputs[0],
+ op.inputs[1],
+ grad,
+ op.get_attr("ksize"),
+ op.get_attr("strides"),
+ padding=op.get_attr("padding"),
+ data_format=op.get_attr("data_format")))
@ops.RegisterGradient("Softmax")
@@ -214,6 +257,7 @@ def _BiasAddGrad(op, received_grad):
return (received_grad, gen_nn_ops.bias_add_grad(out_backprop=received_grad,
data_format=data_format))
+
@ops.RegisterGradient("BiasAddGrad")
def _BiasAddGradGrad(op, received_grad):
"""Gradient for the BiasAddGrad op.
@@ -438,6 +482,16 @@ def _AvgPoolGrad(op, grad):
data_format=op.get_attr("data_format"))
+@ops.RegisterGradient("AvgPoolGrad")
+def _AvgPoolGradGrad(op, grad):
+ return (array_ops.stop_gradient(op.inputs[0]), gen_nn_ops._avg_pool(
+ grad,
+ op.get_attr("ksize"),
+ op.get_attr("strides"),
+ op.get_attr("padding"),
+ data_format=op.get_attr("data_format")))
+
+
@ops.RegisterGradient("MaxPool")
def _MaxPoolGrad(op, grad):
return gen_nn_ops._max_pool_grad(op.inputs[0],
@@ -449,6 +503,38 @@ def _MaxPoolGrad(op, grad):
data_format=op.get_attr("data_format"))
+@ops.RegisterGradient("MaxPoolGrad")
+def _MaxPoolGradGrad(op, grad):
+ return (array_ops.zeros(
+ shape=array_ops.shape(op.inputs[0]),
+ dtype=op.inputs[0].dtype), array_ops.zeros(
+ shape=array_ops.shape(op.inputs[1]), dtype=op.inputs[1].dtype),
+ gen_nn_ops._max_pool_grad_grad(
+ op.inputs[0],
+ op.inputs[1],
+ grad,
+ op.get_attr("ksize"),
+ op.get_attr("strides"),
+ padding=op.get_attr("padding"),
+ data_format=op.get_attr("data_format")))
+
+
+@ops.RegisterGradient("MaxPoolGradGrad")
+def _MaxPoolGradGradGrad(op, grad):
+ return (array_ops.zeros(
+ shape=array_ops.shape(op.inputs[0]),
+ dtype=op.inputs[0].dtype), array_ops.zeros(
+ shape=array_ops.shape(op.inputs[1]), dtype=op.inputs[1].dtype),
+ gen_nn_ops._max_pool_grad(
+ op.inputs[0],
+ op.inputs[1],
+ grad,
+ op.get_attr("ksize"),
+ op.get_attr("strides"),
+ padding=op.get_attr("padding"),
+ data_format=op.get_attr("data_format")))
+
+
@ops.RegisterGradient("FractionalMaxPool")
def _FractionalMaxPoolGrad(op, grad_0, unused_grad_1, unused_grad_2):
"""Returns gradient for FractionalMaxPool.
diff --git a/tensorflow/python/ops/nn_ops.py b/tensorflow/python/ops/nn_ops.py
index 5e6767e30f..4b45b92342 100644
--- a/tensorflow/python/ops/nn_ops.py
+++ b/tensorflow/python/ops/nn_ops.py
@@ -39,6 +39,8 @@ from tensorflow.python.ops.gen_nn_ops import *
# Aliases for some automatically-generated names.
local_response_normalization = gen_nn_ops.lrn
+# pylint: disable=protected-access
+
def _non_atrous_convolution(input, filter, padding, data_format=None, # pylint: disable=redefined-builtin
strides=None, name=None):
@@ -1698,7 +1700,7 @@ def sparse_softmax_cross_entropy_with_logits(_sentinel=None, # pylint: disable=
a probability distribution for each entry, see
`softmax_cross_entropy_with_logits`.
- **WARNING:** This op expects unscaled logits, since it performs a softmax
+ **WARNING:** This op expects unscaled logits, since it performs a `softmax`
on `logits` internally for efficiency. Do not call this op with the
output of `softmax`, as it will produce incorrect results.
diff --git a/tensorflow/python/platform/control_imports.py b/tensorflow/python/platform/control_imports.py
new file mode 100644
index 0000000000..b8e8e78ef3
--- /dev/null
+++ b/tensorflow/python/platform/control_imports.py
@@ -0,0 +1,27 @@
+# Copyright 2015 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Switch between Google or open source dependencies."""
+# Switch between Google and OSS dependencies
+USE_OSS = True
+
+# Per-dependency switches determining whether each dependency is ready
+# to be replaced by its OSS equivalent.
+# TODO(danmane,mrry,opensource): Flip these switches, then remove them
+OSS_APP = True
+OSS_FLAGS = True
+OSS_GFILE = True
+OSS_GOOGLETEST = True
+OSS_LOGGING = True
+OSS_PARAMETERIZED = True
diff --git a/tensorflow/python/platform/googletest.py b/tensorflow/python/platform/googletest.py
index 5227d2e35c..1e74b1512b 100644
--- a/tensorflow/python/platform/googletest.py
+++ b/tensorflow/python/platform/googletest.py
@@ -99,6 +99,7 @@ def main(argv=None): # pylint: disable=function-redefined
def GetTempDir():
"""Return a temporary directory for tests to use."""
+ global _googletest_temp_dir
if not _googletest_temp_dir:
first_frame = inspect.stack()[-1][0]
temp_dir = os.path.join(
@@ -112,7 +113,6 @@ def GetTempDir():
logging.error('Error removing %s: %s', dirname, e)
atexit.register(delete_temp_dir)
- global _googletest_temp_dir
_googletest_temp_dir = temp_dir
return _googletest_temp_dir
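
The `global` statement is hoisted because `_googletest_temp_dir` is read by the `if not` check before the old declaration point, which recent Python 3 releases reject at compile time. A miniature of the corrected shape, with hypothetical names:

```python
_cache = None

def get_cache():
  # The global statement must precede every use of the name in this scope;
  # declaring it after the first read raises, on Python 3.6+,
  #   SyntaxError: name '_cache' is used prior to global declaration
  global _cache
  if not _cache:
    _cache = 'computed once'
  return _cache

print(get_cache())  # 'computed once'
```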
diff --git a/tensorflow/python/saved_model/builder_impl.py b/tensorflow/python/saved_model/builder_impl.py
index 7b4fabad95..d075a04ca2 100644
--- a/tensorflow/python/saved_model/builder_impl.py
+++ b/tensorflow/python/saved_model/builder_impl.py
@@ -57,7 +57,7 @@ class SavedModelBuilder(object):
Typical usage for the `SavedModelBuilder`:
```python
...
- builder = saved_model_builder.SavedModelBuilder(export_dir)
+ builder = saved_model.builder.SavedModelBuilder(export_dir)
with tf.Session(graph=tf.Graph()) as sess:
...
diff --git a/tensorflow/stream_executor/cuda/cuda_dnn.cc b/tensorflow/stream_executor/cuda/cuda_dnn.cc
index 965061053f..6c06a73943 100644
--- a/tensorflow/stream_executor/cuda/cuda_dnn.cc
+++ b/tensorflow/stream_executor/cuda/cuda_dnn.cc
@@ -3084,6 +3084,41 @@ bool CudnnSupport::DoActivate(Stream* stream,
bool CudnnSupport::DoPoolForward(
Stream* stream, const dnn::PoolingDescriptor& pooling_dimensions,
const dnn::BatchDescriptor& input_dimensions,
+ const DeviceMemory<double>& input_data,
+ const dnn::BatchDescriptor& output_dimensions,
+ DeviceMemory<double>* output_data) {
+ mutex_lock lock{dnn_handle_mutex_};
+ auto status = wrap::cudnnSetStream(parent_, ToHandle(dnn_handle_),
+ AsCUDAStreamValue(stream));
+ if (status != CUDNN_STATUS_SUCCESS) {
+ LOG(ERROR) << "failed to set stream for cudnn handle: " << ToString(status);
+ return false;
+ }
+
+ // Alpha is the scaling factor for input.
+ double alpha = 1.0;
+ // Beta is the scaling factor for output.
+ double beta = 0.0;
+
+ ScopedTensorDescriptor src_desc{parent_, input_dimensions, CUDNN_DATA_DOUBLE};
+ ScopedTensorDescriptor dest_desc{parent_, output_dimensions,
+ CUDNN_DATA_DOUBLE};
+ ScopedPoolingDescriptor pooling_desc{parent_, pooling_dimensions};
+ status = wrap::cudnnPoolingForward(
+ parent_, ToHandle(dnn_handle_), pooling_desc.handle(), &alpha,
+ src_desc.handle(), input_data.opaque(), &beta, dest_desc.handle(),
+ output_data->opaque());
+ if (status != CUDNN_STATUS_SUCCESS) {
+ LOG(ERROR) << "failed to enqueue forward pooling on stream: "
+ << ToString(status);
+ return false;
+ }
+ return true;
+}
+
+bool CudnnSupport::DoPoolForward(
+ Stream* stream, const dnn::PoolingDescriptor& pooling_dimensions,
+ const dnn::BatchDescriptor& input_dimensions,
const DeviceMemory<float>& input_data,
const dnn::BatchDescriptor& output_dimensions,
DeviceMemory<float>* output_data) {
@@ -3153,6 +3188,44 @@ bool CudnnSupport::DoPoolForward(
bool CudnnSupport::DoPoolBackward(
Stream* stream, const dnn::PoolingDescriptor& pooling_dimensions,
const dnn::BatchDescriptor& input_dimensions,
+ const DeviceMemory<double>& input_data,
+ const dnn::BatchDescriptor& output_dimensions,
+ const DeviceMemory<double>& output_data,
+ const DeviceMemory<double>& input_diff_data,
+ DeviceMemory<double>* output_diff_data) {
+ mutex_lock lock{dnn_handle_mutex_};
+ auto status = wrap::cudnnSetStream(parent_, ToHandle(dnn_handle_),
+ AsCUDAStreamValue(stream));
+ if (status != CUDNN_STATUS_SUCCESS) {
+ LOG(ERROR) << "failed to set stream for cudnn handle: " << ToString(status);
+ return false;
+ }
+
+ // Alpha is the scaling factor for input.
+ double alpha = 1.0;
+ // Beta is the scaling factor for output.
+ double beta = 0.0;
+
+ ScopedTensorDescriptor src_desc{parent_, input_dimensions, CUDNN_DATA_DOUBLE};
+ ScopedTensorDescriptor dest_desc{parent_, output_dimensions,
+ CUDNN_DATA_DOUBLE};
+ ScopedPoolingDescriptor pooling_desc{parent_, pooling_dimensions};
+ status = wrap::cudnnPoolingBackward(
+ parent_, ToHandle(dnn_handle_), pooling_desc.handle(), &alpha,
+ dest_desc.handle(), output_data.opaque(), dest_desc.handle(),
+ input_diff_data.opaque(), src_desc.handle(), input_data.opaque(), &beta,
+ src_desc.handle(), output_diff_data->opaque());
+ if (status != CUDNN_STATUS_SUCCESS) {
+ LOG(ERROR) << "failed to enqueue backward pooling on stream: "
+ << ToString(status);
+ return false;
+ }
+ return true;
+}
+
+bool CudnnSupport::DoPoolBackward(
+ Stream* stream, const dnn::PoolingDescriptor& pooling_dimensions,
+ const dnn::BatchDescriptor& input_dimensions,
const DeviceMemory<float>& input_data,
const dnn::BatchDescriptor& output_dimensions,
const DeviceMemory<float>& output_data,
diff --git a/tensorflow/stream_executor/cuda/cuda_dnn.h b/tensorflow/stream_executor/cuda/cuda_dnn.h
index cfc7e29574..b280b73c70 100644
--- a/tensorflow/stream_executor/cuda/cuda_dnn.h
+++ b/tensorflow/stream_executor/cuda/cuda_dnn.h
@@ -308,6 +308,13 @@ class CudnnSupport : public dnn::DnnSupport {
bool DoPoolForward(Stream* stream,
const dnn::PoolingDescriptor& pooling_dimensions,
const dnn::BatchDescriptor& input_dimensions,
+ const DeviceMemory<double>& input_data,
+ const dnn::BatchDescriptor& output_dimensions,
+ DeviceMemory<double>* output_data) override;
+
+ bool DoPoolForward(Stream* stream,
+ const dnn::PoolingDescriptor& pooling_dimensions,
+ const dnn::BatchDescriptor& input_dimensions,
const DeviceMemory<float>& input_data,
const dnn::BatchDescriptor& output_dimensions,
DeviceMemory<float>* output_data) override;
@@ -322,6 +329,15 @@ class CudnnSupport : public dnn::DnnSupport {
bool DoPoolBackward(Stream* stream,
const dnn::PoolingDescriptor& pooling_dimensions,
const dnn::BatchDescriptor& input_dimensions,
+ const DeviceMemory<double>& input_data,
+ const dnn::BatchDescriptor& output_dimensions,
+ const DeviceMemory<double>& output_data,
+ const DeviceMemory<double>& input_diff_data,
+ DeviceMemory<double>* output_diff_data) override;
+
+ bool DoPoolBackward(Stream* stream,
+ const dnn::PoolingDescriptor& pooling_dimensions,
+ const dnn::BatchDescriptor& input_dimensions,
const DeviceMemory<float>& input_data,
const dnn::BatchDescriptor& output_dimensions,
const DeviceMemory<float>& output_data,
diff --git a/tensorflow/stream_executor/dnn.h b/tensorflow/stream_executor/dnn.h
index d6b3f51705..c5805064f3 100644
--- a/tensorflow/stream_executor/dnn.h
+++ b/tensorflow/stream_executor/dnn.h
@@ -1283,19 +1283,47 @@ class DnnSupport {
virtual bool DoPoolForward(Stream* stream,
const dnn::PoolingDescriptor& pooling_dimensions,
const dnn::BatchDescriptor& input_dimensions,
+ const DeviceMemory<double>& input_data,
+ const dnn::BatchDescriptor& output_dimensions,
+ DeviceMemory<double>* output_data) {
+ LOG(FATAL) << "DoPoolForward not implemented for double.";
+ return false;
+ }
+
+ virtual bool DoPoolForward(Stream* stream,
+ const dnn::PoolingDescriptor& pooling_dimensions,
+ const dnn::BatchDescriptor& input_dimensions,
const DeviceMemory<Eigen::half>& input_data,
const dnn::BatchDescriptor& output_dimensions,
- DeviceMemory<Eigen::half>* output_data) = 0;
+ DeviceMemory<Eigen::half>* output_data) {
+ LOG(FATAL) << "DoPoolForward not implemented for float16.";
+ return false;
+ }
// Performs differentiation of the pooling operation.
virtual bool DoPoolBackward(Stream* stream,
const dnn::PoolingDescriptor& pooling_dimensions,
const dnn::BatchDescriptor& input_dimensions,
+ const DeviceMemory<double>& input_data,
+ const dnn::BatchDescriptor& output_dimensions,
+ const DeviceMemory<double>& output_data,
+ const DeviceMemory<double>& input_diff_data,
+ DeviceMemory<double>* output_diff_data) {
+ LOG(FATAL) << "DoPoolBackward not implemented.";
+ return false;
+ }
+
+ virtual bool DoPoolBackward(Stream* stream,
+ const dnn::PoolingDescriptor& pooling_dimensions,
+ const dnn::BatchDescriptor& input_dimensions,
const DeviceMemory<float>& input_data,
const dnn::BatchDescriptor& output_dimensions,
const DeviceMemory<float>& output_data,
const DeviceMemory<float>& input_diff_data,
- DeviceMemory<float>* output_diff_data) = 0;
+ DeviceMemory<float>* output_diff_data) {
+ LOG(FATAL) << "DoPoolBackward not implemented.";
+ return false;
+ }
virtual bool DoPoolBackward(Stream* stream,
const dnn::PoolingDescriptor& pooling_dimensions,
@@ -1304,7 +1332,10 @@ class DnnSupport {
const dnn::BatchDescriptor& output_dimensions,
const DeviceMemory<Eigen::half>& output_data,
const DeviceMemory<Eigen::half>& input_diff_data,
- DeviceMemory<Eigen::half>* output_diff_data) = 0;
+ DeviceMemory<Eigen::half>* output_diff_data) {
+ LOG(FATAL) << "DoPoolBackward not implemented.";
+ return false;
+ }
// Applies local response normalization to the values from
// input_data and writes the result to output_data. See comments on
@@ -1884,4 +1915,3 @@ class DnnSupport {
} // namespace perftools
#endif // TENSORFLOW_STREAM_EXECUTOR_DNN_H_
-
diff --git a/tensorflow/stream_executor/stream.cc b/tensorflow/stream_executor/stream.cc
index 76cbf0b1b6..a393b07703 100644
--- a/tensorflow/stream_executor/stream.cc
+++ b/tensorflow/stream_executor/stream.cc
@@ -966,6 +966,30 @@ Stream &Stream::ThenBiasAdd(const DeviceMemory<float> &input_data,
Stream &Stream::ThenPoolForward(
const dnn::PoolingDescriptor &pooling_dimensions,
const dnn::BatchDescriptor &input_dimensions,
+ const DeviceMemory<double> &input_data,
+ const dnn::BatchDescriptor &output_dimensions,
+ DeviceMemory<double> *output_data) {
+ VLOG_CALL(PARAM(pooling_dimensions), PARAM(input_dimensions),
+ PARAM(input_data), PARAM(output_dimensions), PARAM(output_data));
+
+ if (ok()) {
+ if (dnn::DnnSupport *dnn = parent_->AsDnn()) {
+ CheckError(dnn->DoPoolForward(this, pooling_dimensions, input_dimensions,
+ input_data, output_dimensions,
+ output_data));
+ } else {
+ SetError();
+ LOG(WARNING)
+ << "attempting to perform DNN operation using StreamExecutor "
+ "without DNN support";
+ }
+ }
+ return *this;
+}
+
+Stream &Stream::ThenPoolForward(
+ const dnn::PoolingDescriptor &pooling_dimensions,
+ const dnn::BatchDescriptor &input_dimensions,
const DeviceMemory<float> &input_data,
const dnn::BatchDescriptor &output_dimensions,
DeviceMemory<float> *output_data) {
@@ -1008,6 +1032,33 @@ Stream &Stream::ThenPoolForward(
Stream &Stream::ThenPoolBackward(
const dnn::PoolingDescriptor &pooling_dimensions,
const dnn::BatchDescriptor &input_dimensions,
+ const DeviceMemory<double> &input_data,
+ const dnn::BatchDescriptor &output_dimensions,
+ const DeviceMemory<double> &output_data,
+ const DeviceMemory<double> &input_diff_data,
+ DeviceMemory<double> *output_diff_data) {
+ VLOG_CALL(PARAM(pooling_dimensions), PARAM(input_dimensions),
+ PARAM(input_data), PARAM(output_dimensions), PARAM(output_data),
+ PARAM(input_diff_data), PARAM(output_diff_data));
+
+ if (ok()) {
+ if (dnn::DnnSupport *dnn = parent_->AsDnn()) {
+ CheckError(dnn->DoPoolBackward(this, pooling_dimensions, input_dimensions,
+ input_data, output_dimensions, output_data,
+ input_diff_data, output_diff_data));
+ } else {
+ SetError();
+ LOG(WARNING)
+ << "attempting to perform DNN operation using StreamExecutor "
+ "without DNN support";
+ }
+ }
+ return *this;
+}
+
+Stream &Stream::ThenPoolBackward(
+ const dnn::PoolingDescriptor &pooling_dimensions,
+ const dnn::BatchDescriptor &input_dimensions,
const DeviceMemory<float> &input_data,
const dnn::BatchDescriptor &output_dimensions,
const DeviceMemory<float> &output_data,
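
Each new `Stream::ThenPool*` overload follows the wrapper pattern visible above: do nothing if the stream is already in an error state, look up the platform's DNN support, record failure via `CheckError`/`SetError`, and return `*this` so calls chain. A toy Python model of that control flow (hypothetical names, assuming a backend object exposing `do_pool_forward`):

```python
class Stream(object):
    def __init__(self, dnn=None):
        self._dnn = dnn  # hypothetical DNN backend; None means absent
        self._ok = True

    def ok(self):
        return self._ok

    def then_pool_forward(self, *args):
        if self.ok():
            if self._dnn is not None:
                # CheckError: a failed backend call poisons the stream,
                # so later Then* calls become no-ops.
                self._ok = bool(self._dnn.do_pool_forward(self, *args))
            else:
                # SetError: no DNN support on this StreamExecutor.
                self._ok = False
        return self  # enables stream.then_pool_forward(...).then_x(...)
```
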
diff --git a/tensorflow/stream_executor/stream.h b/tensorflow/stream_executor/stream.h
index f22fba1d74..5b46b86f54 100644
--- a/tensorflow/stream_executor/stream.h
+++ b/tensorflow/stream_executor/stream.h
@@ -467,6 +467,12 @@ class Stream {
Stream &ThenPoolForward(const dnn::PoolingDescriptor &pooling_dimensions,
const dnn::BatchDescriptor &input_dimensions,
+ const DeviceMemory<double> &input_data,
+ const dnn::BatchDescriptor &output_dimensions,
+ DeviceMemory<double> *output_data);
+
+ Stream &ThenPoolForward(const dnn::PoolingDescriptor &pooling_dimensions,
+ const dnn::BatchDescriptor &input_dimensions,
const DeviceMemory<float> &input_data,
const dnn::BatchDescriptor &output_dimensions,
DeviceMemory<float> *output_data);
@@ -479,6 +485,14 @@ class Stream {
Stream &ThenPoolBackward(const dnn::PoolingDescriptor &pooling_dimensions,
const dnn::BatchDescriptor &input_dimensions,
+ const DeviceMemory<double> &input_data,
+ const dnn::BatchDescriptor &output_dimensions,
+ const DeviceMemory<double> &output_data,
+ const DeviceMemory<double> &input_diff_data,
+ DeviceMemory<double> *output_diff_data);
+
+ Stream &ThenPoolBackward(const dnn::PoolingDescriptor &pooling_dimensions,
+ const dnn::BatchDescriptor &input_dimensions,
const DeviceMemory<float> &input_data,
const dnn::BatchDescriptor &output_dimensions,
const DeviceMemory<float> &output_data,
diff --git a/tensorflow/tensorboard/components/vz_line_chart/vz-line-chart.ts b/tensorflow/tensorboard/components/vz_line_chart/vz-line-chart.ts
index 3ca1d8da29..4bc6a8a837 100644
--- a/tensorflow/tensorboard/components/vz_line_chart/vz-line-chart.ts
+++ b/tensorflow/tensorboard/components/vz_line_chart/vz-line-chart.ts
@@ -453,35 +453,19 @@ module VZ {
}
private resmoothDataset(dataset: Plottable.Dataset) {
- // When increasing the smoothing window, it smoothes a lot with the first
- // few points and then starts to gradually smooth slower, so using an
- // exponential function makes the slider more consistent. 1000^x has a
- // range of [1, 1000], so subtracting 1 and dividing by 999 results in a
- // range of [0, 1], which can be used as the percentage of the data, so
- // that the kernel size can be specified as a percentage instead of a
- // hardcoded number, what would be bad with multiple series.
- let factor = (Math.pow(1000, this.smoothingWeight) - 1) / 999;
let data = dataset.data();
- let kernelRadius = Math.floor(data.length * factor / 2);
-
- data.forEach((d, i) => {
- let actualKernelRadius = Math.min(kernelRadius, i);
- let start = i - actualKernelRadius;
- let end = i + actualKernelRadius + 1;
- if (end >= data.length) {
- // In the beginning, it's OK for the smoothing window to be small,
- // but this is not desirable towards the end. Rather than shrinking
- // the window, or extrapolating data to fill the gap, we're simply
- // not going to display the smoothed line towards the end.
- d.smoothed = Infinity;
- } else if (!_.isFinite(d.scalar)) {
- // Only smooth finite numbers.
+ const smoothingWeight = this.smoothingWeight;
+ let last = data.length > 0 ? data[0].scalar : NaN;
+ data.forEach((d) => {
+ if (!_.isFinite(last)) {
d.smoothed = d.scalar;
} else {
- d.smoothed = d3.mean(
- data.slice(start, end).filter((d) => _.isFinite(d.scalar)),
- (d) => d.scalar);
+ // 1st-order IIR low-pass filter to attenuate the higher-
+ // frequency components of the time-series.
+ d.smoothed =
+ last * smoothingWeight + (1 - smoothingWeight) * d.scalar;
}
+ last = d.smoothed;
});
}
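
The TensorBoard change replaces the windowed-mean smoother (whose kernel radius was derived from an exponential of the slider value) with an exponential moving average, s_t = w * s_{t-1} + (1 - w) * x_t. That makes the cost O(n) per series regardless of the smoothing weight, and each point depends only on its predecessor. A standalone Python sketch of the same filter:

```python
import math

def resmooth(scalars, smoothing_weight):
    """1st-order IIR low-pass filter, as in the chart code above."""
    smoothed, last = [], float("nan")
    for x in scalars:
        if not math.isfinite(last):
            y = x  # no finite value carried forward yet
        else:
            y = last * smoothing_weight + (1 - smoothing_weight) * x
        smoothed.append(y)
        last = y
    return smoothed

print(resmooth([0.0, 10.0, 10.0, 10.0], 0.6))
# [0.0, 4.0, 6.4, 7.84] -- the output relaxes toward the input level
```
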
diff --git a/tensorflow/tensorboard/plugins/debugger/BUILD b/tensorflow/tensorboard/plugins/debugger/BUILD
index 86254dc3aa..38aa719b9b 100644
--- a/tensorflow/tensorboard/plugins/debugger/BUILD
+++ b/tensorflow/tensorboard/plugins/debugger/BUILD
@@ -30,6 +30,7 @@ py_test(
srcs = ["debugger_plugin_test.py"],
main = "debugger_plugin_test.py",
srcs_version = "PY2AND3",
+ tags = ["no_pip"],
deps = [
":debugger_plugin",
"//tensorflow/core:protos_all_py",
diff --git a/tensorflow/tensorflow.bzl b/tensorflow/tensorflow.bzl
index a5637de1aa..a8007b803d 100644
--- a/tensorflow/tensorflow.bzl
+++ b/tensorflow/tensorflow.bzl
@@ -1,42 +1,45 @@
# -*- Python -*-
+
# Given a source file, generate a test name.
# i.e. "common_runtime/direct_session_test.cc" becomes
# "common_runtime_direct_session_test"
def src_to_test_name(src):
return src.replace("/", "_").split(".")[0]
+
# Return the options to use for a C++ library or binary build.
# Uses the ":optmode" config_setting to pick the options.
load(
"//tensorflow/core:platform/default/build_config_root.bzl",
"tf_cuda_tests_tags",
"tf_sycl_tests_tags",
- "tf_additional_xla_deps_py",
-)
-load(
- "@local_config_cuda//cuda:build_defs.bzl",
- "if_cuda",
- "cuda_default_copts"
-)
+ "tf_additional_xla_deps_py",)
+load("@local_config_cuda//cuda:build_defs.bzl", "if_cuda", "cuda_default_copts")
load(
"//third_party/mkl:build_defs.bzl",
- "if_mkl",
-)
+ "if_mkl",)
+
# List of proto files for android builds
def tf_android_core_proto_sources(core_proto_sources_relative):
- return ["//tensorflow/core:" + p
- for p in core_proto_sources_relative]
+ return [
+ "//tensorflow/core:" + p for p in core_proto_sources_relative
+ ]
+
# Returns the list of pb.h and proto.h headers that are generated for
# tf_android_core_proto_sources().
def tf_android_core_proto_headers(core_proto_sources_relative):
- return (["//tensorflow/core/" + p.replace(".proto", ".pb.h")
- for p in core_proto_sources_relative] +
- ["//tensorflow/core/" + p.replace(".proto", ".proto.h")
- for p in core_proto_sources_relative])
+ return ([
+ "//tensorflow/core/" + p.replace(".proto", ".pb.h")
+ for p in core_proto_sources_relative
+ ] + [
+ "//tensorflow/core/" + p.replace(".proto", ".proto.h")
+ for p in core_proto_sources_relative
+ ])
+
def if_android_x86(a):
return select({
@@ -52,30 +55,35 @@ def if_android_arm(a):
"//conditions:default": [],
})
+
def if_android_arm64(a):
return select({
"//tensorflow:android_arm64": a,
"//conditions:default": [],
})
+
def if_not_android(a):
return select({
"//tensorflow:android": [],
"//conditions:default": a,
})
+
def if_android(a):
return select({
"//tensorflow:android": a,
"//conditions:default": [],
})
+
def if_ios(a):
return select({
"//tensorflow:ios": a,
"//conditions:default": [],
})
+
def if_mobile(a):
return select({
"//tensorflow:android": a,
@@ -83,6 +91,7 @@ def if_mobile(a):
"//conditions:default": [],
})
+
def if_not_mobile(a):
return select({
"//tensorflow:android": [],
@@ -90,12 +99,14 @@ def if_not_mobile(a):
"//conditions:default": a,
})
+
def if_not_windows(a):
return select({
"//tensorflow:windows": [],
"//conditions:default": a,
})
+
def if_x86(a):
return select({
"//tensorflow:linux_x86_64": a,
@@ -103,33 +114,34 @@ def if_x86(a):
"//conditions:default": [],
})
+
# LINT.IfChange
def tf_copts():
- return (["-DEIGEN_AVOID_STL_ARRAY",
- "-Iexternal/gemmlowp",
- "-Wno-sign-compare",
- "-fno-exceptions",] +
- if_cuda(["-DGOOGLE_CUDA=1"]) +
- if_mkl(["-DINTEL_MKL=1"]) +
- if_android_arm(["-mfpu=neon"]) +
- if_x86(["-msse3"]) +
- select({
- "//tensorflow:android": [
- "-std=c++11",
- "-DTF_LEAN_BINARY",
- "-O2",
- ],
- "//tensorflow:darwin": [],
- "//tensorflow:windows": [
- "/DLANG_CXX11",
- "/D__VERSION__=\\\"MSVC\\\"",
- "/DPLATFORM_WINDOWS",
- "/DTF_COMPILE_LIBRARY",
- "/DEIGEN_HAS_C99_MATH",
- "/DTENSORFLOW_USE_EIGEN_THREADPOOL",
- ],
- "//tensorflow:ios": ["-std=c++11"],
- "//conditions:default": ["-pthread"]}))
+ return ([
+ "-DEIGEN_AVOID_STL_ARRAY",
+ "-Iexternal/gemmlowp",
+ "-Wno-sign-compare",
+ "-fno-exceptions",
+ ] + if_cuda(["-DGOOGLE_CUDA=1"]) + if_mkl(["-DINTEL_MKL=1"]) + if_android_arm(
+ ["-mfpu=neon"]) + if_x86(["-msse3"]) + select({
+ "//tensorflow:android": [
+ "-std=c++11",
+ "-DTF_LEAN_BINARY",
+ "-O2",
+ ],
+ "//tensorflow:darwin": [],
+ "//tensorflow:windows": [
+ "/DLANG_CXX11",
+ "/D__VERSION__=\\\"MSVC\\\"",
+ "/DPLATFORM_WINDOWS",
+ "/DTF_COMPILE_LIBRARY",
+ "/DEIGEN_HAS_C99_MATH",
+ "/DTENSORFLOW_USE_EIGEN_THREADPOOL",
+ ],
+ "//tensorflow:ios": ["-std=c++11"],
+ "//conditions:default": ["-pthread"]
+ }))
+
def tf_opts_nortti_if_android():
return if_android([
@@ -137,8 +149,11 @@ def tf_opts_nortti_if_android():
"-DGOOGLE_PROTOBUF_NO_RTTI",
"-DGOOGLE_PROTOBUF_NO_STATIC_INITIALIZER",
]) + if_android_x86(["-msse4.1"])
+
+
# LINT.ThenChange(//tensorflow/contrib/android/cmake/CMakeLists.txt)
+
# Given a list of "op_lib_names" (a list of files in the ops directory
# without their .cc extensions), generate a library for that file.
def tf_gen_op_libs(op_lib_names, deps=None):
@@ -147,15 +162,19 @@ def tf_gen_op_libs(op_lib_names, deps=None):
if not deps:
deps = []
for n in op_lib_names:
- native.cc_library(name=n + "_op_lib",
- copts=tf_copts(),
- srcs=["ops/" + n + ".cc"],
- deps=deps + ["//tensorflow/core:framework"],
- visibility=["//visibility:public"],
- alwayslink=1,
- linkstatic=1,)
-
-def tf_gen_op_wrapper_cc(name, out_ops_file, pkg="",
+ native.cc_library(
+ name=n + "_op_lib",
+ copts=tf_copts(),
+ srcs=["ops/" + n + ".cc"],
+ deps=deps + ["//tensorflow/core:framework"],
+ visibility=["//visibility:public"],
+ alwayslink=1,
+ linkstatic=1,)
+
+
+def tf_gen_op_wrapper_cc(name,
+ out_ops_file,
+ pkg="",
op_gen="//tensorflow/cc:cc_op_gen_main",
deps=None,
override_file=None,
@@ -165,12 +184,11 @@ def tf_gen_op_wrapper_cc(name, out_ops_file, pkg="",
if deps == None:
deps = [pkg + ":" + name + "_op_lib"]
native.cc_binary(
- name = tool,
- copts = tf_copts(),
- linkopts = ["-lm"],
- linkstatic = 1, # Faster to link this one-time-use binary dynamically
- deps = [op_gen] + deps
- )
+ name=tool,
+ copts=tf_copts(),
+ linkopts=["-lm"],
+ linkstatic=1, # Faster to link this one-time-use binary dynamically
+ deps=[op_gen] + deps)
if override_file == None:
srcs = []
@@ -180,14 +198,17 @@ def tf_gen_op_wrapper_cc(name, out_ops_file, pkg="",
override_arg = "$(location " + override_file + ")"
native.genrule(
name=name + "_genrule",
- outs=[out_ops_file + ".h", out_ops_file + ".cc",
- out_ops_file + "_internal.h", out_ops_file + "_internal.cc"],
+ outs=[
+ out_ops_file + ".h", out_ops_file + ".cc",
+ out_ops_file + "_internal.h", out_ops_file + "_internal.cc"
+ ],
srcs=srcs,
tools=[":" + tool],
cmd=("$(location :" + tool + ") $(location :" + out_ops_file + ".h) " +
"$(location :" + out_ops_file + ".cc) " + override_arg + " " +
str(include_internal_ops)))
+
# Given a list of "op_lib_names" (a list of files in the ops directory
# without their .cc extensions), generate individual C++ .cc and .h
# files for each of the ops files mentioned, and then generate a
@@ -235,59 +256,72 @@ def tf_gen_op_wrappers_cc(name,
internalhdrs = []
for n in op_lib_names:
tf_gen_op_wrapper_cc(
- n, "ops/" + n, pkg=pkg, op_gen=op_gen, override_file=override_file,
+ n,
+ "ops/" + n,
+ pkg=pkg,
+ op_gen=op_gen,
+ override_file=override_file,
include_internal_ops=include_internal_ops)
subsrcs += ["ops/" + n + ".cc"]
subhdrs += ["ops/" + n + ".h"]
internalsrcs += ["ops/" + n + "_internal.cc"]
internalhdrs += ["ops/" + n + "_internal.h"]
- native.cc_library(name=name,
- srcs=subsrcs,
- hdrs=subhdrs,
- deps=deps + if_not_android([
- "//tensorflow/core:core_cpu",
- "//tensorflow/core:framework",
- "//tensorflow/core:lib",
- "//tensorflow/core:protos_all_cc",
- ]) + if_android([
- "//tensorflow/core:android_tensorflow_lib",
- ]),
- copts=tf_copts(),
- alwayslink=1,
- visibility=visibility)
- native.cc_library(name=name + "_internal",
- srcs=internalsrcs,
- hdrs=internalhdrs,
- deps=deps + if_not_android([
- "//tensorflow/core:core_cpu",
- "//tensorflow/core:framework",
- "//tensorflow/core:lib",
- "//tensorflow/core:protos_all_cc",
- ]) + if_android([
- "//tensorflow/core:android_tensorflow_lib",
- ]),
- copts=tf_copts(),
- alwayslink=1,
- visibility=["//tensorflow:internal"])
+ native.cc_library(
+ name=name,
+ srcs=subsrcs,
+ hdrs=subhdrs,
+ deps=deps + if_not_android([
+ "//tensorflow/core:core_cpu",
+ "//tensorflow/core:framework",
+ "//tensorflow/core:lib",
+ "//tensorflow/core:protos_all_cc",
+ ]) + if_android([
+ "//tensorflow/core:android_tensorflow_lib",
+ ]),
+ copts=tf_copts(),
+ alwayslink=1,
+ visibility=visibility)
+ native.cc_library(
+ name=name + "_internal",
+ srcs=internalsrcs,
+ hdrs=internalhdrs,
+ deps=deps + if_not_android([
+ "//tensorflow/core:core_cpu",
+ "//tensorflow/core:framework",
+ "//tensorflow/core:lib",
+ "//tensorflow/core:protos_all_cc",
+ ]) + if_android([
+ "//tensorflow/core:android_tensorflow_lib",
+ ]),
+ copts=tf_copts(),
+ alwayslink=1,
+ visibility=["//tensorflow:internal"])
+
# Invoke this rule in .../tensorflow/python to build the wrapper library.
-def tf_gen_op_wrapper_py(name, out=None, hidden=None, visibility=None, deps=[],
- require_shape_functions=False, hidden_file=None,
+def tf_gen_op_wrapper_py(name,
+ out=None,
+ hidden=None,
+ visibility=None,
+ deps=[],
+ require_shape_functions=False,
+ hidden_file=None,
generated_target_name=None):
# Construct a cc_binary containing the specified ops.
tool_name = "gen_" + name + "_py_wrappers_cc"
if not deps:
deps = ["//tensorflow/core:" + name + "_op_lib"]
native.cc_binary(
- name = tool_name,
- linkopts = ["-lm"],
- copts = tf_copts(),
- linkstatic = 1, # Faster to link this one-time-use binary dynamically
- deps = (["//tensorflow/core:framework",
- "//tensorflow/python:python_op_gen_main"] + deps),
- visibility = ["//tensorflow:internal"],
- )
+ name=tool_name,
+ linkopts=["-lm"],
+ copts=tf_copts(),
+ linkstatic=1, # Faster to link this one-time-use binary dynamically
+ deps=([
+ "//tensorflow/core:framework",
+ "//tensorflow/python:python_op_gen_main"
+ ] + deps),
+ visibility=["//tensorflow:internal"],)
# Invoke the previous cc_binary to generate a python file.
if not out:
@@ -299,8 +333,8 @@ def tf_gen_op_wrapper_py(name, out=None, hidden=None, visibility=None, deps=[],
name=name + "_pygenrule",
outs=[out],
tools=[tool_name],
- cmd=("$(location " + tool_name + ") " + ",".join(hidden)
- + " " + ("1" if require_shape_functions else "0") + " > $@"))
+ cmd=("$(location " + tool_name + ") " + ",".join(hidden) + " " +
+ ("1" if require_shape_functions else "0") + " > $@"))
elif hidden_file:
# `hidden_file` is file containing a list of op names to be hidden in the
# generated module.
@@ -309,77 +343,120 @@ def tf_gen_op_wrapper_py(name, out=None, hidden=None, visibility=None, deps=[],
outs=[out],
srcs=[hidden_file],
tools=[tool_name],
- cmd=("$(location " + tool_name + ") @$(location "
- + hidden_file + ") " + ("1" if require_shape_functions else "0")
- + " > $@"))
+ cmd=("$(location " + tool_name + ") @$(location " + hidden_file + ") " +
+ ("1" if require_shape_functions else "0") + " > $@"))
else:
# No ops should be hidden in the generated module.
native.genrule(
name=name + "_pygenrule",
outs=[out],
tools=[tool_name],
- cmd=("$(location " + tool_name + ") "
- + ("1" if require_shape_functions else "0") + " > $@"))
+ cmd=("$(location " + tool_name + ") " +
+ ("1" if require_shape_functions else "0") + " > $@"))
# Make a py_library out of the generated python file.
if not generated_target_name:
generated_target_name = name
- native.py_library(name=generated_target_name,
- srcs=[out],
- srcs_version="PY2AND3",
- visibility=visibility,
- deps=[
- "//tensorflow/python:framework_for_generated_wrappers_v2",
- ],)
+ native.py_library(
+ name=generated_target_name,
+ srcs=[out],
+ srcs_version="PY2AND3",
+ visibility=visibility,
+ deps=[
+ "//tensorflow/python:framework_for_generated_wrappers_v2",
+ ],)
+
# Define a bazel macro that creates cc_test for tensorflow.
# TODO(opensource): we need to enable this to work around the hidden symbol
# __cudaRegisterFatBinary error. Need more investigations.
-def tf_cc_test(name, srcs, deps, linkstatic=0, tags=[], data=[], size="medium",
- suffix="", args=None, linkopts=[]):
- native.cc_test(name="%s%s" % (name, suffix),
- srcs=srcs,
- size=size,
- args=args,
- copts=tf_copts(),
- data=data,
- deps=deps,
- linkopts=["-lpthread", "-lm"] + linkopts,
- linkstatic=linkstatic,
- tags=tags)
+def tf_cc_test(name,
+ srcs,
+ deps,
+ linkstatic=0,
+ tags=[],
+ data=[],
+ size="medium",
+ suffix="",
+ args=None,
+ linkopts=[]):
+ native.cc_test(
+ name="%s%s" % (name, suffix),
+ srcs=srcs,
+ size=size,
+ args=args,
+ copts=tf_copts(),
+ data=data,
+ deps=deps,
+ linkopts=["-lpthread", "-lm"] + linkopts,
+ linkstatic=linkstatic,
+ tags=tags)
+
# Part of the testing workflow requires a distinguishable name for the build
# rules that involve a GPU, even if otherwise identical to the base rule.
-def tf_cc_test_gpu(name, srcs, deps, linkstatic=0, tags=[], data=[],
- size="medium", suffix="", args=None):
- tf_cc_test(name, srcs, deps, linkstatic=linkstatic, tags=tags, data=data,
- size=size, suffix=suffix, args=args)
-
-def tf_cuda_cc_test(name, srcs=[], deps=[], tags=[], data=[], size="medium",
- linkstatic=0, args=[], linkopts=[]):
- tf_cc_test(name=name,
- srcs=srcs,
- deps=deps,
- tags=tags + ["manual"],
- data=data,
- size=size,
- linkstatic=linkstatic,
- linkopts=linkopts,
- args=args)
- tf_cc_test(name=name,
- srcs=srcs,
- suffix="_gpu",
- deps=deps + if_cuda(["//tensorflow/core:gpu_runtime"]),
- linkstatic=if_cuda(1, 0),
- tags=tags + tf_cuda_tests_tags(),
- data=data,
- size=size,
- linkopts=linkopts,
- args=args)
+def tf_cc_test_gpu(name,
+ srcs,
+ deps,
+ linkstatic=0,
+ tags=[],
+ data=[],
+ size="medium",
+ suffix="",
+ args=None):
+ tf_cc_test(
+ name,
+ srcs,
+ deps,
+ linkstatic=linkstatic,
+ tags=tags,
+ data=data,
+ size=size,
+ suffix=suffix,
+ args=args)
+
+
+def tf_cuda_cc_test(name,
+ srcs=[],
+ deps=[],
+ tags=[],
+ data=[],
+ size="medium",
+ linkstatic=0,
+ args=[],
+ linkopts=[]):
+ tf_cc_test(
+ name=name,
+ srcs=srcs,
+ deps=deps,
+ tags=tags + ["manual"],
+ data=data,
+ size=size,
+ linkstatic=linkstatic,
+ linkopts=linkopts,
+ args=args)
+ tf_cc_test(
+ name=name,
+ srcs=srcs,
+ suffix="_gpu",
+ deps=deps + if_cuda(["//tensorflow/core:gpu_runtime"]),
+ linkstatic=if_cuda(1, 0),
+ tags=tags + tf_cuda_tests_tags(),
+ data=data,
+ size=size,
+ linkopts=linkopts,
+ args=args)
+
# Create a cc_test for each of the tensorflow tests listed in "tests"
-def tf_cc_tests(srcs, deps, name='', linkstatic=0, tags=[], size="medium",
- args=None, linkopts=[]):
+def tf_cc_tests(srcs,
+ deps,
+ name="",
+ linkstatic=0,
+ tags=[],
+ size="medium",
+ args=None,
+ linkopts=[]):
for src in srcs:
tf_cc_test(
name=src_to_test_name(src),
@@ -391,17 +468,35 @@ def tf_cc_tests(srcs, deps, name='', linkstatic=0, tags=[], size="medium",
args=args,
linkopts=linkopts)
-def tf_cc_test_mkl(srcs, deps, name='', linkstatic=0, tags=[], size="medium",
- args=None):
+
+def tf_cc_test_mkl(srcs,
+ deps,
+ name="",
+ linkstatic=0,
+ tags=[],
+ size="medium",
+ args=None):
if_mkl(tf_cc_tests(srcs, deps, linkstatic, tags=tags, size=size, args=args))
-def tf_cc_tests_gpu(srcs, deps, name='', linkstatic=0, tags=[], size="medium",
+
+def tf_cc_tests_gpu(srcs,
+ deps,
+ name="",
+ linkstatic=0,
+ tags=[],
+ size="medium",
args=None):
tf_cc_tests(srcs, deps, linkstatic, tags=tags, size=size, args=args)
-def tf_cuda_cc_tests(srcs, deps, name='', tags=[], size="medium", linkstatic=0,
- args=None, linkopts=[]):
+def tf_cuda_cc_tests(srcs,
+ deps,
+ name="",
+ tags=[],
+ size="medium",
+ linkstatic=0,
+ args=None,
+ linkopts=[]):
for src in srcs:
tf_cuda_cc_test(
name=src_to_test_name(src),
@@ -413,48 +508,52 @@ def tf_cuda_cc_tests(srcs, deps, name='', tags=[], size="medium", linkstatic=0,
args=args,
linkopts=linkopts)
+
def _cuda_copts():
- """Gets the appropriate set of copts for (maybe) CUDA compilation.
+ """Gets the appropriate set of copts for (maybe) CUDA compilation.
If we're doing CUDA compilation, returns copts for our particular CUDA
compiler. If we're not doing CUDA compilation, returns an empty list.
"""
- return cuda_default_copts() + select({
- "//conditions:default": [],
- "@local_config_cuda//cuda:using_nvcc": (
- [
- "-nvcc_options=relaxed-constexpr",
- "-nvcc_options=ftz=true",
- ]
- ),
- "@local_config_cuda//cuda:using_clang": (
- [
- "-fcuda-flush-denormals-to-zero",
- ]
- ),
- })
+ return cuda_default_copts() + select({
+ "//conditions:default": [],
+ "@local_config_cuda//cuda:using_nvcc": ([
+ "-nvcc_options=relaxed-constexpr",
+ "-nvcc_options=ftz=true",
+ ]),
+ "@local_config_cuda//cuda:using_clang": ([
+ "-fcuda-flush-denormals-to-zero",
+ ]),
+ })
+
# Build defs for TensorFlow kernels
+
# When this target is built using --config=cuda, a cc_library is built
# that passes -DGOOGLE_CUDA=1 and '-x cuda', linking in additional
# libraries needed by GPU kernels.
-def tf_gpu_kernel_library(srcs, copts=[], cuda_copts=[], deps=[], hdrs=[],
+def tf_gpu_kernel_library(srcs,
+ copts=[],
+ cuda_copts=[],
+ deps=[],
+ hdrs=[],
**kwargs):
copts = copts + _cuda_copts() + if_cuda(cuda_copts) + tf_copts()
native.cc_library(
- srcs = srcs,
- hdrs = hdrs,
- copts = copts,
- deps = deps + if_cuda([
+ srcs=srcs,
+ hdrs=hdrs,
+ copts=copts,
+ deps=deps + if_cuda([
"//tensorflow/core:cuda",
"//tensorflow/core:gpu_lib",
]),
alwayslink=1,
**kwargs)
+
def tf_cuda_library(deps=None, cuda_deps=None, copts=None, **kwargs):
"""Generate a cc_library with a conditional set of CUDA dependencies.
@@ -479,15 +578,23 @@ def tf_cuda_library(deps=None, cuda_deps=None, copts=None, **kwargs):
copts = []
native.cc_library(
- deps = deps + if_cuda(cuda_deps + [
+ deps=deps + if_cuda(cuda_deps + [
"//tensorflow/core:cuda",
"@local_config_cuda//cuda:cuda_headers"
]),
- copts = copts + if_cuda(["-DGOOGLE_CUDA=1"]) + if_mkl(["-DINTEL_MKL=1"]),
+ copts=copts + if_cuda(["-DGOOGLE_CUDA=1"]) + if_mkl(["-DINTEL_MKL=1"]),
**kwargs)
-def tf_kernel_library(name, prefix=None, srcs=None, gpu_srcs=None, hdrs=None,
- deps=None, alwayslink=1, copts=tf_copts(), **kwargs):
+
+def tf_kernel_library(name,
+ prefix=None,
+ srcs=None,
+ gpu_srcs=None,
+ hdrs=None,
+ deps=None,
+ alwayslink=1,
+ copts=tf_copts(),
+ **kwargs):
"""A rule to build a TensorFlow OpKernel.
May either specify srcs/hdrs or prefix. Similar to tf_cuda_library,
@@ -517,38 +624,59 @@ def tf_kernel_library(name, prefix=None, srcs=None, gpu_srcs=None, hdrs=None,
deps = []
if prefix:
- if native.glob([prefix + "*.cu.cc"], exclude = ["*test*"]):
+ if native.glob([prefix + "*.cu.cc"], exclude=["*test*"]):
if not gpu_srcs:
gpu_srcs = []
- gpu_srcs = gpu_srcs + native.glob([prefix + "*.cu.cc", prefix + "*.h"],
- exclude = [prefix + "*test*"])
- srcs = srcs + native.glob([prefix + "*.cc"],
- exclude = [prefix + "*test*", prefix + "*.cu.cc"])
- hdrs = hdrs + native.glob([prefix + "*.h"], exclude = [prefix + "*test*",
- prefix + "*.cu.h"])
+ gpu_srcs = gpu_srcs + native.glob(
+ [prefix + "*.cu.cc", prefix + "*.h"], exclude=[prefix + "*test*"])
+ srcs = srcs + native.glob(
+ [prefix + "*.cc"], exclude=[prefix + "*test*", prefix + "*.cu.cc"])
+ hdrs = hdrs + native.glob(
+ [prefix + "*.h"], exclude=[prefix + "*test*", prefix + "*.cu.h"])
cuda_deps = ["//tensorflow/core:gpu_lib"]
if gpu_srcs:
for gpu_src in gpu_srcs:
if gpu_src.endswith(".cc") and not gpu_src.endswith(".cu.cc"):
- fail("{} not allowed in gpu_srcs. .cc sources must end with .cu.cc".format(gpu_src))
+ fail("{} not allowed in gpu_srcs. .cc sources must end with .cu.cc".
+ format(gpu_src))
tf_gpu_kernel_library(
- name = name + "_gpu",
- srcs = gpu_srcs,
- deps = deps,
- **kwargs)
+ name=name + "_gpu", srcs=gpu_srcs, deps=deps, **kwargs)
cuda_deps.extend([":" + name + "_gpu"])
tf_cuda_library(
- name = name,
- srcs = srcs,
- hdrs = hdrs,
- copts = copts,
- cuda_deps = cuda_deps,
- linkstatic = 1, # Needed since alwayslink is broken in bazel b/27630669
- alwayslink = alwayslink,
- deps = deps,
+ name=name,
+ srcs=srcs,
+ hdrs=hdrs,
+ copts=copts,
+ cuda_deps=cuda_deps,
+ linkstatic=1, # Needed since alwayslink is broken in bazel b/27630669
+ alwayslink=alwayslink,
+ deps=deps,
**kwargs)
+
+def tf_mkl_kernel_library(name,
+ prefix=None,
+ srcs=None,
+ gpu_srcs=None,
+ hdrs=None,
+ deps=None,
+ alwayslink=1,
+ copts=tf_copts(),
+ **kwargs):
+ if_mkl(
+ tf_kernel_library(
+ name,
+ prefix=prefix,
+ srcs=srcs,
+ gpu_srcs=gpu_srcs,
+ hdrs=hdrs,
+ deps=deps,
+ alwayslink=alwayslink,
+ copts=copts,
+ **kwargs))
+
+
# Bazel rules for building swig files.
def _py_wrap_cc_impl(ctx):
srcs = ctx.files.srcs
@@ -564,59 +692,61 @@ def _py_wrap_cc_impl(ctx):
inputs += ctx.files.toolchain_deps
swig_include_dirs = set(_get_repository_roots(ctx, inputs))
swig_include_dirs += sorted([f.dirname for f in ctx.files._swiglib])
- args = ["-c++",
- "-python",
- "-module", module_name,
- "-o", ctx.outputs.cc_out.path,
- "-outdir", ctx.outputs.py_out.dirname]
+ args = [
+ "-c++", "-python", "-module", module_name, "-o", ctx.outputs.cc_out.path,
+ "-outdir", ctx.outputs.py_out.dirname
+ ]
args += ["-l" + f.path for f in ctx.files.swig_includes]
args += ["-I" + i for i in swig_include_dirs]
args += [src.path]
- outputs = [ctx.outputs.cc_out,
- ctx.outputs.py_out]
- ctx.action(executable=ctx.executable._swig,
- arguments=args,
- inputs=list(inputs),
- outputs=outputs,
- mnemonic="PythonSwig",
- progress_message="SWIGing " + src.path)
+ outputs = [ctx.outputs.cc_out, ctx.outputs.py_out]
+ ctx.action(
+ executable=ctx.executable._swig,
+ arguments=args,
+ inputs=list(inputs),
+ outputs=outputs,
+ mnemonic="PythonSwig",
+ progress_message="SWIGing " + src.path)
return struct(files=set(outputs))
+
_py_wrap_cc = rule(
- attrs = {
- "srcs": attr.label_list(
- mandatory = True,
- allow_files = True,
- ),
- "swig_includes": attr.label_list(
- cfg = "data",
- allow_files = True,
- ),
- "deps": attr.label_list(
- allow_files = True,
- providers = ["cc"],
- ),
- "toolchain_deps": attr.label_list(
- allow_files = True,
- ),
- "module_name": attr.string(mandatory = True),
- "py_module_name": attr.string(mandatory = True),
- "_swig": attr.label(
- default = Label("@swig//:swig"),
- executable = True,
- cfg = "host",
- ),
- "_swiglib": attr.label(
- default = Label("@swig//:templates"),
- allow_files = True,
- ),
+ attrs={
+ "srcs":
+ attr.label_list(
+ mandatory=True,
+ allow_files=True,),
+ "swig_includes":
+ attr.label_list(
+ cfg="data",
+ allow_files=True,),
+ "deps":
+ attr.label_list(
+ allow_files=True,
+ providers=["cc"],),
+ "toolchain_deps":
+ attr.label_list(
+ allow_files=True,),
+ "module_name":
+ attr.string(mandatory=True),
+ "py_module_name":
+ attr.string(mandatory=True),
+ "_swig":
+ attr.label(
+ default=Label("@swig//:swig"),
+ executable=True,
+ cfg="host",),
+ "_swiglib":
+ attr.label(
+ default=Label("@swig//:templates"),
+ allow_files=True,),
},
- outputs = {
+ outputs={
"cc_out": "%{module_name}.cc",
"py_out": "%{py_module_name}.py",
},
- implementation = _py_wrap_cc_impl,
-)
+ implementation=_py_wrap_cc_impl,)
+
def _get_repository_roots(ctx, files):
"""Returns abnormal root directories under which files reside.
@@ -647,6 +777,7 @@ def _get_repository_roots(ctx, files):
result[root] -= 1
return [k for v, k in sorted([(v, k) for k, v in result.items()])]
+
# Bazel rule for collecting the header files that a target depends on.
def _transitive_hdrs_impl(ctx):
outputs = set()
@@ -654,30 +785,27 @@ def _transitive_hdrs_impl(ctx):
outputs += dep.cc.transitive_headers
return struct(files=outputs)
+
_transitive_hdrs = rule(
- attrs = {
+ attrs={
"deps": attr.label_list(
- allow_files = True,
- providers = ["cc"],
- ),
+ allow_files=True,
+ providers=["cc"],),
},
- implementation = _transitive_hdrs_impl,
-)
+ implementation=_transitive_hdrs_impl,)
+
def transitive_hdrs(name, deps=[], **kwargs):
- _transitive_hdrs(name=name + "_gather",
- deps=deps)
- native.filegroup(name=name,
- srcs=[":" + name + "_gather"])
+ _transitive_hdrs(name=name + "_gather", deps=deps)
+ native.filegroup(name=name, srcs=[":" + name + "_gather"])
+
# Create a header only library that includes all the headers exported by
# the libraries in deps.
def cc_header_only_library(name, deps=[], **kwargs):
- _transitive_hdrs(name=name + "_gather",
- deps=deps)
- native.cc_library(name=name,
- hdrs=[":" + name + "_gather"],
- **kwargs)
+ _transitive_hdrs(name=name + "_gather", deps=deps)
+ native.cc_library(name=name, hdrs=[":" + name + "_gather"], **kwargs)
+
def tf_custom_op_library_additional_deps():
return [
@@ -686,6 +814,7 @@ def tf_custom_op_library_additional_deps():
"//tensorflow/core:framework_headers_lib",
]
+
# Traverse the dependency graph along the "deps" attribute of the
# target and return a struct with one field called 'tf_collected_deps'.
# tf_collected_deps will be the union of the deps of the current target
@@ -699,14 +828,16 @@ def _collect_deps_aspect_impl(target, ctx):
alldeps = alldeps | dep.tf_collected_deps
return struct(tf_collected_deps=alldeps)
+
collect_deps_aspect = aspect(
- implementation=_collect_deps_aspect_impl,
- attr_aspects=["deps"])
+ implementation=_collect_deps_aspect_impl, attr_aspects=["deps"])
+
def _dep_label(dep):
label = dep.label
return label.package + ":" + label.name
+
# This rule checks that the transitive dependencies of targets listed
# in the 'deps' attribute don't depend on the targets listed in
# the 'disallowed_deps' attribute.
@@ -718,23 +849,23 @@ def _check_deps_impl(ctx):
for dep in input_dep.tf_collected_deps:
for disallowed_dep in disallowed_deps:
if dep == disallowed_dep.label:
- fail(_dep_label(input_dep) + " cannot depend on " +
- _dep_label(disallowed_dep))
+ fail(
+ _dep_label(input_dep) + " cannot depend on " + _dep_label(
+ disallowed_dep))
return struct()
+
check_deps = rule(
_check_deps_impl,
- attrs = {
- "deps": attr.label_list(
- aspects=[collect_deps_aspect],
- mandatory = True,
- allow_files = True
- ),
- "disallowed_deps": attr.label_list(
- mandatory = True,
- allow_files = True
- )},
-)
+ attrs={
+ "deps":
+ attr.label_list(
+ aspects=[collect_deps_aspect], mandatory=True,
+ allow_files=True),
+ "disallowed_deps":
+ attr.label_list(mandatory=True, allow_files=True)
+ },)
+
# Helper to build a dynamic library (.so) from the sources containing
# implementations of custom ops and kernels.
@@ -747,33 +878,42 @@ def tf_custom_op_library(name, srcs=[], gpu_srcs=[], deps=[]):
if gpu_srcs:
basename = name.split(".")[0]
native.cc_library(
- name = basename + "_gpu",
- srcs = gpu_srcs,
- copts = _cuda_copts(),
- deps = deps + if_cuda(cuda_deps))
+ name=basename + "_gpu",
+ srcs=gpu_srcs,
+ copts=_cuda_copts(),
+ deps=deps + if_cuda(cuda_deps))
cuda_deps.extend([":" + basename + "_gpu"])
- check_deps(name=name+"_check_deps",
- deps=deps + if_cuda(cuda_deps),
- disallowed_deps=["//tensorflow/core:framework",
- "//tensorflow/core:lib"])
-
- native.cc_binary(name=name,
- srcs=srcs,
- deps=deps + if_cuda(cuda_deps),
- data=[name + "_check_deps"],
- copts=tf_copts(),
- linkshared=1,
- linkopts = select({
- "//conditions:default": [
- "-lm",
- ],
- "//tensorflow:darwin": [],
- }),
- )
-
-def tf_custom_op_py_library(name, srcs=[], dso=[], kernels=[],
- srcs_version="PY2AND3", visibility=None, deps=[]):
+ check_deps(
+ name=name + "_check_deps",
+ deps=deps + if_cuda(cuda_deps),
+ disallowed_deps=[
+ "//tensorflow/core:framework",
+ "//tensorflow/core:lib"
+ ])
+
+ native.cc_binary(
+ name=name,
+ srcs=srcs,
+ deps=deps + if_cuda(cuda_deps),
+ data=[name + "_check_deps"],
+ copts=tf_copts(),
+ linkshared=1,
+ linkopts=select({
+ "//conditions:default": [
+ "-lm",
+ ],
+ "//tensorflow:darwin": [],
+ }),)
+
+
+def tf_custom_op_py_library(name,
+ srcs=[],
+ dso=[],
+ kernels=[],
+ srcs_version="PY2AND3",
+ visibility=None,
+ deps=[]):
kernels = kernels # unused argument
native.py_library(
name=name,
@@ -781,86 +921,103 @@ def tf_custom_op_py_library(name, srcs=[], dso=[], kernels=[],
srcs=srcs,
srcs_version=srcs_version,
visibility=visibility,
- deps=deps,
- )
+ deps=deps,)
+
def tf_extension_linkopts():
return [] # No extension link opts
+
def tf_extension_copts():
return [] # No extension c opts
-def tf_py_wrap_cc(name, srcs, swig_includes=[], deps=[], copts=[], **kwargs):
+
+def tf_py_wrap_cc(name,
+ srcs,
+ swig_includes=[],
+ deps=[],
+ copts=[],
+ **kwargs):
module_name = name.split("/")[-1]
# Convert a rule name such as foo/bar/baz to foo/bar/_baz.so
# and use that as the name for the rule producing the .so file.
cc_library_name = "/".join(name.split("/")[:-1] + ["_" + module_name + ".so"])
- cc_library_pyd_name = "/".join(name.split("/")[:-1] + ["_" + module_name + ".pyd"])
+ cc_library_pyd_name = "/".join(
+ name.split("/")[:-1] + ["_" + module_name + ".pyd"])
extra_deps = []
- _py_wrap_cc(name=name + "_py_wrap",
- srcs=srcs,
- swig_includes=swig_includes,
- deps=deps + extra_deps,
- toolchain_deps=["//tools/defaults:crosstool"],
- module_name=module_name,
- py_module_name=name)
+ _py_wrap_cc(
+ name=name + "_py_wrap",
+ srcs=srcs,
+ swig_includes=swig_includes,
+ deps=deps + extra_deps,
+ toolchain_deps=["//tools/defaults:crosstool"],
+ module_name=module_name,
+ py_module_name=name)
extra_linkopts = select({
"@local_config_cuda//cuda:darwin": [
"-Wl,-exported_symbols_list",
"//tensorflow:tf_exported_symbols.lds"
],
- "//tensorflow:windows": [
- ],
+ str(Label("//tensorflow:windows")): [],
"//conditions:default": [
"-Wl,--version-script",
"//tensorflow:tf_version_script.lds"
- ]})
+ ]
+ })
extra_deps += select({
"@local_config_cuda//cuda:darwin": [
- "//tensorflow:tf_exported_symbols.lds"
- ],
- "//tensorflow:windows": [
+ "//tensorflow:tf_exported_symbols.lds"
],
+ "//tensorflow:windows": [],
"//conditions:default": [
- "//tensorflow:tf_version_script.lds"
+ "//tensorflow:tf_version_script.lds"
]
})
native.cc_binary(
name=cc_library_name,
srcs=[module_name + ".cc"],
- copts=(copts + ["-Wno-self-assign",
- "-Wno-sign-compare",
- "-Wno-write-strings"]
- + tf_extension_copts()),
+ copts=(copts + [
+ "-Wno-self-assign", "-Wno-sign-compare", "-Wno-write-strings"
+ ] + tf_extension_copts()),
linkopts=tf_extension_linkopts() + extra_linkopts,
linkstatic=1,
linkshared=1,
deps=deps + extra_deps)
native.genrule(
- name = "gen_" + cc_library_pyd_name,
- srcs = [":" + cc_library_name],
- outs = [cc_library_pyd_name],
- cmd = "cp $< $@",
- )
- native.py_library(name=name,
- srcs=[":" + name + ".py"],
- srcs_version="PY2AND3",
- data=select({
- "//tensorflow:windows": [":" + cc_library_pyd_name],
- "//conditions:default": [":" + cc_library_name],
- }))
+ name="gen_" + cc_library_pyd_name,
+ srcs=[":" + cc_library_name],
+ outs=[cc_library_pyd_name],
+ cmd="cp $< $@",)
+ native.py_library(
+ name=name,
+ srcs=[":" + name + ".py"],
+ srcs_version="PY2AND3",
+ data=select({
+ "//tensorflow:windows": [":" + cc_library_pyd_name],
+ "//conditions:default": [":" + cc_library_name],
+ }))
+
def py_test(deps=[], **kwargs):
native.py_test(
deps=select({
- "//conditions:default" : deps,
- "//tensorflow:no_tensorflow_py_deps" : []
+ "//conditions:default": deps,
+ "//tensorflow:no_tensorflow_py_deps": []
}),
**kwargs)
-def tf_py_test(name, srcs, size="medium", data=[], main=None, args=[],
- tags=[], shard_count=1, additional_deps=[], flaky=0,
+
+def tf_py_test(name,
+ srcs,
+ size="medium",
+ data=[],
+ main=None,
+ args=[],
+ tags=[],
+ shard_count=1,
+ additional_deps=[],
+ flaky=0,
xla_enabled=False):
if xla_enabled:
additional_deps += tf_additional_xla_deps_py()
@@ -875,46 +1032,67 @@ def tf_py_test(name, srcs, size="medium", data=[], main=None, args=[],
shard_count=shard_count,
data=data,
deps=select({
- "//conditions:default" : [
- "//tensorflow/python:extra_py_tests_deps",
- "//tensorflow/python:gradient_checker",
+ "//conditions:default": [
+ "//tensorflow/python:extra_py_tests_deps",
+ "//tensorflow/python:gradient_checker",
] + additional_deps,
- "//tensorflow:no_tensorflow_py_deps" : []
+ "//tensorflow:no_tensorflow_py_deps": []
}),
flaky=flaky,
srcs_version="PY2AND3")
-def cuda_py_test(name, srcs, size="medium", data=[], main=None, args=[],
- shard_count=1, additional_deps=[], tags=[], flaky=0,
+
+def cuda_py_test(name,
+ srcs,
+ size="medium",
+ data=[],
+ main=None,
+ args=[],
+ shard_count=1,
+ additional_deps=[],
+ tags=[],
+ flaky=0,
xla_enabled=False):
test_tags = tags + tf_cuda_tests_tags()
- tf_py_test(name=name,
- size=size,
- srcs=srcs,
- data=data,
- main=main,
- args=args,
- tags=test_tags,
- shard_count=shard_count,
- additional_deps=additional_deps,
- flaky=flaky,
- xla_enabled=xla_enabled)
-
-def sycl_py_test(name, srcs, size="medium", data=[], main=None, args=[],
- shard_count=1, additional_deps=[], tags=[], flaky=0,
+ tf_py_test(
+ name=name,
+ size=size,
+ srcs=srcs,
+ data=data,
+ main=main,
+ args=args,
+ tags=test_tags,
+ shard_count=shard_count,
+ additional_deps=additional_deps,
+ flaky=flaky,
+ xla_enabled=xla_enabled)
+
+
+def sycl_py_test(name,
+ srcs,
+ size="medium",
+ data=[],
+ main=None,
+ args=[],
+ shard_count=1,
+ additional_deps=[],
+ tags=[],
+ flaky=0,
xla_enabled=False):
- test_tags = tags + tf_sycl_tests_tags()
- tf_py_test(name=name,
- size=size,
- srcs=srcs,
- data=data,
- main=main,
- args=args,
- tags=test_tags,
- shard_count=shard_count,
- additional_deps=additional_deps,
- flaky=flaky,
- xla_enabled=xla_enabled)
+ test_tags = tags + tf_sycl_tests_tags()
+ tf_py_test(
+ name=name,
+ size=size,
+ srcs=srcs,
+ data=data,
+ main=main,
+ args=args,
+ tags=test_tags,
+ shard_count=shard_count,
+ additional_deps=additional_deps,
+ flaky=flaky,
+ xla_enabled=xla_enabled)
+
def py_tests(name,
srcs,
@@ -929,22 +1107,39 @@ def py_tests(name,
test_name = src.split("/")[-1].split(".")[0]
if prefix:
test_name = "%s_%s" % (prefix, test_name)
- tf_py_test(name=test_name,
- size=size,
- srcs=[src],
- main=src,
- tags=tags,
- shard_count=shard_count,
- data=data,
- additional_deps=additional_deps,
- xla_enabled=xla_enabled)
-
-def cuda_py_tests(name, srcs, size="medium", additional_deps=[], data=[],
- shard_count=1, tags=[], prefix="", xla_enabled=False):
+ tf_py_test(
+ name=test_name,
+ size=size,
+ srcs=[src],
+ main=src,
+ tags=tags,
+ shard_count=shard_count,
+ data=data,
+ additional_deps=additional_deps,
+ xla_enabled=xla_enabled)
+
+
+def cuda_py_tests(name,
+ srcs,
+ size="medium",
+ additional_deps=[],
+ data=[],
+ shard_count=1,
+ tags=[],
+ prefix="",
+ xla_enabled=False):
test_tags = tags + tf_cuda_tests_tags()
- py_tests(name=name, size=size, srcs=srcs, additional_deps=additional_deps,
- data=data, tags=test_tags, shard_count=shard_count,prefix=prefix,
- xla_enabled=xla_enabled)
+ py_tests(
+ name=name,
+ size=size,
+ srcs=srcs,
+ additional_deps=additional_deps,
+ data=data,
+ tags=test_tags,
+ shard_count=shard_count,
+ prefix=prefix,
+ xla_enabled=xla_enabled)
+
# Creates a genrule named <name> for running tools/proto_text's generator to
# make the proto_text functions, for the protos passed in <srcs>.
@@ -952,40 +1147,46 @@ def cuda_py_tests(name, srcs, size="medium", additional_deps=[], data=[],
# Return a struct with fields (hdrs, srcs) containing the names of the
# generated files.
def tf_generate_proto_text_sources(name, srcs_relative_dir, srcs):
- out_hdrs = ([p.replace(".proto", ".pb_text.h") for p in srcs] +
- [p.replace(".proto", ".pb_text-impl.h") for p in srcs])
+ out_hdrs = (
+ [p.replace(".proto", ".pb_text.h")
+ for p in srcs] + [p.replace(".proto", ".pb_text-impl.h") for p in srcs])
out_srcs = [p.replace(".proto", ".pb_text.cc") for p in srcs]
native.genrule(
- name = name,
- srcs = srcs + ["//tensorflow/tools/proto_text:placeholder.txt"],
- outs = out_hdrs + out_srcs,
- cmd = "$(location //tensorflow/tools/proto_text:gen_proto_text_functions) " +
- "$(@D) " + srcs_relative_dir + " $(SRCS)",
- tools = ["//tensorflow/tools/proto_text:gen_proto_text_functions"],
- )
+ name=name,
+ srcs=srcs + ["//tensorflow/tools/proto_text:placeholder.txt"],
+ outs=out_hdrs + out_srcs,
+ cmd=
+ "$(location //tensorflow/tools/proto_text:gen_proto_text_functions) "
+ + "$(@D) " + srcs_relative_dir + " $(SRCS)",
+ tools=[
+ "//tensorflow/tools/proto_text:gen_proto_text_functions"
+ ],)
return struct(hdrs=out_hdrs, srcs=out_srcs)
+
def tf_genrule_cmd_append_to_srcs(to_append):
- return ("cat $(SRCS) > $(@) && " +
- "echo >> $(@) && " +
- "echo " + to_append + " >> $(@)")
+ return ("cat $(SRCS) > $(@) && " + "echo >> $(@) && " + "echo " + to_append +
+ " >> $(@)")
def tf_version_info_genrule():
native.genrule(
- name = "version_info_gen",
- srcs = [
+ name="version_info_gen",
+ srcs=[
"//tensorflow/tools/git:gen/spec.json",
"//tensorflow/tools/git:gen/head",
"//tensorflow/tools/git:gen/branch_ref",
],
- outs = ["util/version_info.cc"],
- cmd = "$(location //tensorflow/tools/git:gen_git_source.py) --generate $(SRCS) \"$@\"",
- local = 1,
- tools = ["//tensorflow/tools/git:gen_git_source.py"],
- )
-
-def cc_library_with_android_deps(deps, android_deps=[],
- common_deps=[], **kwargs):
+ outs=["util/version_info.cc"],
+ cmd=
+ "$(location //tensorflow/tools/git:gen_git_source.py) --generate $(SRCS) \"$@\"",
+ local=1,
+ tools=["//tensorflow/tools/git:gen_git_source.py"],)
+
+
+def cc_library_with_android_deps(deps,
+ android_deps=[],
+ common_deps=[],
+ **kwargs):
deps = if_not_android(deps) + if_android(android_deps) + common_deps
native.cc_library(deps=deps, **kwargs)
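
Most of the tensorflow.bzl churn above is mechanical buildifier-style reformatting, but the macros it touches share one idea: compile options and deps are assembled by concatenating plain lists with select()-chosen lists (if_cuda, if_mkl, if_android, ...). A toy, pure-Python model of that composition (the real select() is resolved by Bazel at analysis time, so this is only an illustration):

```python
def select(branches, active_config):
    # Toy resolver: Bazel picks the branch whose config_setting
    # matches, falling back to the default branch.
    return branches.get(active_config, branches["//conditions:default"])

def if_cuda(flags, cfg):
    return select({"@local_config_cuda//cuda:using_nvcc": flags,
                   "//conditions:default": []}, cfg)

def tf_copts(cfg):
    return (["-DEIGEN_AVOID_STL_ARRAY", "-Wno-sign-compare"]
            + if_cuda(["-DGOOGLE_CUDA=1"], cfg))

print(tf_copts("//conditions:default"))
# ['-DEIGEN_AVOID_STL_ARRAY', '-Wno-sign-compare']
print(tf_copts("@local_config_cuda//cuda:using_nvcc"))
# ['-DEIGEN_AVOID_STL_ARRAY', '-Wno-sign-compare', '-DGOOGLE_CUDA=1']
```
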
diff --git a/tensorflow/tools/ci_build/Dockerfile.android b/tensorflow/tools/ci_build/Dockerfile.android
index 4d46c672ab..c6679f7882 100644
--- a/tensorflow/tools/ci_build/Dockerfile.android
+++ b/tensorflow/tools/ci_build/Dockerfile.android
@@ -29,7 +29,8 @@ RUN mkdir -p ${ANDROID_DEV_HOME}
ENV ANDROID_SDK_FILENAME tools_r25.2.5-linux.zip
ENV ANDROID_SDK_URL https://dl.google.com/android/repository/${ANDROID_SDK_FILENAME}
ENV ANDROID_API_LEVEL 23
-ENV ANDROID_BUILD_TOOLS_VERSION 25.0.1
+# The Build Tools version is liable to change.
+ENV ANDROID_BUILD_TOOLS_VERSION 25.0.2
ENV ANDROID_SDK_HOME ${ANDROID_DEV_HOME}/sdk
ENV PATH ${PATH}:${ANDROID_SDK_HOME}/tools:${ANDROID_SDK_HOME}/platform-tools
RUN cd ${ANDROID_DEV_HOME} && \
diff --git a/tensorflow/tools/ci_build/builds/pip.sh b/tensorflow/tools/ci_build/builds/pip.sh
index 751f7de9a1..e0a1391d6e 100755
--- a/tensorflow/tools/ci_build/builds/pip.sh
+++ b/tensorflow/tools/ci_build/builds/pip.sh
@@ -269,7 +269,7 @@ pip install --upgrade pip==8.1.2
# Force tensorflow reinstallation. Otherwise it may not get installed from
# last build if it had the same version number as previous build.
-PIP_FLAGS="--upgrade --force-reinstall --no-deps"
+PIP_FLAGS="--upgrade --force-reinstall"
pip install -v ${PIP_FLAGS} ${WHL_PATH} || \
die "pip install (forcing to reinstall tensorflow) FAILED"
echo "Successfully installed pip package ${WHL_PATH}"
diff --git a/tensorflow/tools/ci_build/windows/bazel/common_env.sh b/tensorflow/tools/ci_build/windows/bazel/common_env.sh
index 662de93c16..b993747521 100644
--- a/tensorflow/tools/ci_build/windows/bazel/common_env.sh
+++ b/tensorflow/tools/ci_build/windows/bazel/common_env.sh
@@ -54,4 +54,4 @@ export PATH="/c/Program Files/NVIDIA GPU Computing Toolkit/CUDA/v8.0/extras/CUPT
export PATH="/c/tools/cuda/bin:$PATH"
# Set the common build options on Windows
-export BUILD_OPTS='--cpu=x64_windows_msvc --host_cpu=x64_windows_msvc --copt=/w --verbose_failures --experimental_ui'
+export BUILD_OPTS='--cpu=x64_windows_msvc --host_cpu=x64_windows_msvc --copt=-w --host_copt=-w --verbose_failures --experimental_ui'
diff --git a/tensorflow/tools/ci_build/windows/cpu/bazel/common_env.sh b/tensorflow/tools/ci_build/windows/cpu/bazel/common_env.sh
deleted file mode 100644
index 6e7e555065..0000000000
--- a/tensorflow/tools/ci_build/windows/cpu/bazel/common_env.sh
+++ /dev/null
@@ -1,50 +0,0 @@
-#!/bin/bash
-# Copyright 2016 The TensorFlow Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-# ==============================================================================
-#
-# This script assumes the standard setup on tensorflow Jenkins windows machines.
-# It is NOT guaranteed to work on any other machine. Use at your own risk!
-#
-# REQUIREMENTS:
-# * All installed in standard locations:
-# - JDK8, and JAVA_HOME set.
-# - Microsoft Visual Studio 2015 Community Edition
-# - Msys2
-# - Anaconda3
-# * Bazel windows executable copied as "bazel.exe" and included in PATH.
-
-# All commands shall pass, and all should be visible.
-set -x
-set -e
-
-# Use a temporary directory with a short name.
-export TMPDIR="C:/tmp"
-mkdir -p "$TMPDIR"
-
-# Set bash path
-export BAZEL_SH="C:/tools/msys64/usr/bin/bash"
-
-# Set Python path for ./configure
-export PYTHON_BIN_PATH="C:/Program Files/Anaconda3/python"
-
-# Set Python path for cc_configure.bzl
-export BAZEL_PYTHON="C:/Program Files/Anaconda3/python"
-
-# Set Visual Studio path
-export BAZEL_VS="C:/Program Files (x86)/Microsoft Visual Studio 14.0"
-
-# Add python into PATH, it's needed because gen_git_source.py uses
-# '/usr/bin/env python' as a shebang
-export PATH="/c/Program Files/Anaconda3:$PATH"
diff --git a/tensorflow/tools/dist_test/Dockerfile b/tensorflow/tools/dist_test/Dockerfile
index 65d7e1717e..83bbeeca8a 100644
--- a/tensorflow/tools/dist_test/Dockerfile
+++ b/tensorflow/tools/dist_test/Dockerfile
@@ -23,7 +23,7 @@ FROM ubuntu:16.04
MAINTAINER Shanqing Cai <cais@google.com>
RUN apt-get update
-RUN apt-get install -y --no-install-recommends \
+RUN apt-get install -y \
curl \
python \
python-numpy \
diff --git a/tensorflow/tools/docker/Dockerfile.devel b/tensorflow/tools/docker/Dockerfile.devel
index dd18b61017..7bf7fd5719 100644
--- a/tensorflow/tools/docker/Dockerfile.devel
+++ b/tensorflow/tools/docker/Dockerfile.devel
@@ -82,7 +82,7 @@ RUN mkdir /bazel && \
RUN git clone https://github.com/tensorflow/tensorflow.git && \
cd tensorflow && \
- git checkout r1.0
+ git checkout r1.1
WORKDIR /tensorflow
# TODO(craigcitro): Don't install the pip package, since it makes it
diff --git a/tensorflow/tools/docker/Dockerfile.devel-gpu b/tensorflow/tools/docker/Dockerfile.devel-gpu
index 8ead2f15ae..769731974a 100644
--- a/tensorflow/tools/docker/Dockerfile.devel-gpu
+++ b/tensorflow/tools/docker/Dockerfile.devel-gpu
@@ -82,7 +82,7 @@ RUN mkdir /bazel && \
RUN git clone https://github.com/tensorflow/tensorflow.git && \
cd tensorflow && \
- git checkout r1.0
+ git checkout r1.1
WORKDIR /tensorflow
# Configure the build for our CUDA configuration.
diff --git a/tensorflow/tools/graph_transforms/README.md b/tensorflow/tools/graph_transforms/README.md
index 6597adb68a..06ae78ef5d 100644
--- a/tensorflow/tools/graph_transforms/README.md
+++ b/tensorflow/tools/graph_transforms/README.md
@@ -103,8 +103,8 @@ output layers of the model are. The best source for these is the model training
process, where for a classifier the inputs will be the nodes that receive the
data from the training set, and the output will be the predictions. If you're
unsure, the
-[summarize_graph](https://github.com/tensorflow/tensorflow/blob/master/tensorflow/tools/graph_transforms/summarize_graph.cc)
-can inspect the model and provide guesses about likely input and output nodes,
+[summarize_graph](https://github.com/tensorflow/tensorflow/blob/master/tensorflow/tools/graph_transforms/summarize_graph_main.cc)
+tool can inspect the model and provide guesses about likely input and output nodes,
as well as other information that's useful for debugging. Here's an example of
how to use it on the [Inception V3
graph](http://download.tensorflow.org/models/image/imagenet/inception-2015-12-05.tgz):
diff --git a/tensorflow/tools/pip_package/setup.py b/tensorflow/tools/pip_package/setup.py
index 376eaedc75..f591e50ac9 100644
--- a/tensorflow/tools/pip_package/setup.py
+++ b/tensorflow/tools/pip_package/setup.py
@@ -29,7 +29,7 @@ from setuptools.dist import Distribution
# This version string is semver compatible, but incompatible with pip.
# For pip, we will remove all '-' characters from this string, and use the
# result for pip.
-_VERSION = '1.0.1'
+_VERSION = '1.1.0-rc0'
REQUIRED_PACKAGES = [
'numpy >= 1.11.0',
@@ -167,15 +167,12 @@ headers = (list(find_files('*.h', 'tensorflow/core')) +
list(find_files('*', 'third_party/eigen3')) +
list(find_files('*', 'external/eigen_archive')))
-tf_long_description = (
- 'Note: TensorFlow manylinux1 wheels do not conform to the '
- 'specification in PEP531.')
setup(
name=project_name,
version=_VERSION.replace('-', ''),
description='TensorFlow helps the tensors flow',
- long_description=tf_long_description,
+ long_description='',
url='http://tensorflow.org/',
author='Google Inc.',
author_email='opensource@google.com',
diff --git a/tensorflow/tools/test/check_futures_test.py b/tensorflow/tools/test/check_futures_test.py
index 32d65adb1f..36a61c0ecc 100644
--- a/tensorflow/tools/test/check_futures_test.py
+++ b/tensorflow/tools/test/check_futures_test.py
@@ -40,6 +40,7 @@ FUTURES_PATTERN_2 = re.compile(
REQUIRED_FUTURES = frozenset(['absolute_import', 'division', 'print_function'])
WHITELIST = [
+ 'python/platform/control_imports.py',
'tools/docker/jupyter_notebook_config.py',
]
diff --git a/tensorflow/workspace.bzl b/tensorflow/workspace.bzl
index b4578d6860..7bcdb1613d 100644
--- a/tensorflow/workspace.bzl
+++ b/tensorflow/workspace.bzl
@@ -1,9 +1,11 @@
# TensorFlow external dependencies that can be loaded in WORKSPACE files.
-load("@io_bazel_rules_closure//closure/private:java_import_external.bzl", "java_import_external")
+load("@io_bazel_rules_closure//closure/private:java_import_external.bzl",
+ "java_import_external")
load("@io_bazel_rules_closure//closure:defs.bzl", "filegroup_external")
load("@io_bazel_rules_closure//closure:defs.bzl", "webfiles_external")
load("//third_party/gpus:cuda_configure.bzl", "cuda_configure")
+
load("//third_party/sycl:sycl_configure.bzl", "sycl_configure")
@@ -14,20 +16,23 @@ def _parse_bazel_version(bazel_version):
# Split into (release, date) parts and only return the release
# as a tuple of integers.
- parts = version.split('-', 1)
+ parts = version.split("-", 1)
# Turn "release" into a tuple of strings
version_tuple = ()
- for number in parts[0].split('.'):
+ for number in parts[0].split("."):
version_tuple += (str(number),)
return version_tuple
+
# Check that a specific bazel version is being used.
def check_version(bazel_version):
if "bazel_version" not in dir(native):
- fail("\nCurrent Bazel version is lower than 0.2.1, expected at least %s\n" % bazel_version)
+ fail("\nCurrent Bazel version is lower than 0.2.1, expected at least %s\n" %
+ bazel_version)
elif not native.bazel_version:
- print("\nCurrent Bazel is not a release version, cannot check for compatibility.")
+ print("\nCurrent Bazel is not a release version, cannot check for " +
+ "compatibility.")
print("Make sure that you are running at least Bazel %s.\n" % bazel_version)
else:
current_bazel_version = _parse_bazel_version(native.bazel_version)
@@ -37,523 +42,529 @@ def check_version(bazel_version):
native.bazel_version, bazel_version))
pass
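
For reference, `_parse_bazel_version` above can be exercised as plain Python; note that it yields a tuple of strings, so comparisons are lexicographic (adequate while Bazel versions stay single-digit per component). The elided lines strip any wrapper text before the split; here the input is assumed to be the bare version string:

```python
def _parse_bazel_version(bazel_version):
    # Keep only the release part ("0.4.5" from "0.4.5-2017.03.03"),
    # then split it into a tuple of component strings.
    parts = bazel_version.split("-", 1)
    version_tuple = ()
    for number in parts[0].split("."):
        version_tuple += (str(number),)
    return version_tuple

print(_parse_bazel_version("0.4.5-2017.03.03"))  # ('0', '4', '5')
assert _parse_bazel_version("0.4.5") >= _parse_bazel_version("0.2.1")
```
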
+
def _repos_are_siblings():
return Label("@foo//bar").workspace_root.startswith("../")
+
# Temporary workaround to support including TensorFlow as a submodule until this
# use-case is supported in the next Bazel release.
def _temp_workaround_http_archive_impl(repo_ctx):
- repo_ctx.template("BUILD", repo_ctx.attr.build_file,
- {
- "%prefix%" : ".." if _repos_are_siblings() else "external",
- "%ws%": repo_ctx.attr.repository
- }, False)
- repo_ctx.download_and_extract(repo_ctx.attr.urls, "", repo_ctx.attr.sha256,
- "", repo_ctx.attr.strip_prefix)
- if repo_ctx.attr.patch_file != None:
- _apply_patch(repo_ctx, repo_ctx.attr.patch_file)
+ repo_ctx.template("BUILD", repo_ctx.attr.build_file, {
+ "%prefix%": ".." if _repos_are_siblings() else "external",
+ "%ws%": repo_ctx.attr.repository
+ }, False)
+ repo_ctx.download_and_extract(repo_ctx.attr.urls, "", repo_ctx.attr.sha256,
+ "", repo_ctx.attr.strip_prefix)
+ if repo_ctx.attr.patch_file != None:
+ _apply_patch(repo_ctx, repo_ctx.attr.patch_file)
+
temp_workaround_http_archive = repository_rule(
- implementation=_temp_workaround_http_archive_impl,
- attrs = {
- "build_file": attr.label(),
- "repository": attr.string(),
- "patch_file": attr.label(default = None),
- "urls": attr.string_list(default = []),
- "sha256": attr.string(default = ""),
- "strip_prefix": attr.string(default = ""),
- })
+ implementation=_temp_workaround_http_archive_impl,
+ attrs={
+ "build_file": attr.label(),
+ "repository": attr.string(),
+ "patch_file": attr.label(default=None),
+ "urls": attr.string_list(default=[]),
+ "sha256": attr.string(default=""),
+ "strip_prefix": attr.string(default=""),
+ })
+
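
The workaround rule above templates a BUILD file with two placeholders before downloading the archive. A plain-Python mirror of just the substitution step (`repo_ctx.template` itself is a Bazel API; the helper name here is made up):

```python
def expand_build_template(template, repos_are_siblings, workspace):
    # Same two placeholders as the repo_ctx.template(...) call above.
    substitutions = {
        "%prefix%": ".." if repos_are_siblings else "external",
        "%ws%": workspace,
    }
    for placeholder, value in substitutions.items():
        template = template.replace(placeholder, value)
    return template

print(expand_build_template('includes = ["%prefix%/%ws%/src"]',
                            False, "gemmlowp"))
# includes = ["external/gemmlowp/src"]
```
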
# Executes specified command with arguments and calls 'fail' if it exited with non-zero code
def _execute_and_check_ret_code(repo_ctx, cmd_and_args):
result = repo_ctx.execute(cmd_and_args)
if result.return_code != 0:
- fail(("Non-zero return code({1}) when executing '{0}':\n" +
- "Stdout: {2}\n" +
- "Stderr: {3}").format(" ".join(cmd_and_args),
- result.return_code, result.stdout, result.stderr))
+ fail(("Non-zero return code({1}) when executing '{0}':\n" + "Stdout: {2}\n"
+ + "Stderr: {3}").format(" ".join(cmd_and_args), result.return_code,
+ result.stdout, result.stderr))
+
# Apply a patch_file to the repository root directory
# Runs 'patch -p1'
def _apply_patch(repo_ctx, patch_file):
- _execute_and_check_ret_code(repo_ctx, ["patch", "-p1",
- "-d", repo_ctx.path("."),
- "-i", repo_ctx.path(patch_file)])
+ _execute_and_check_ret_code(repo_ctx, [
+ "patch", "-p1", "-d", repo_ctx.path("."), "-i", repo_ctx.path(patch_file)
+ ])
+
# Download the repository and apply a patch to its root
def _patched_http_archive_impl(repo_ctx):
- repo_ctx.download_and_extract(repo_ctx.attr.urls,
- sha256 = repo_ctx.attr.sha256,
- stripPrefix = repo_ctx.attr.strip_prefix)
+ repo_ctx.download_and_extract(
+ repo_ctx.attr.urls,
+ sha256=repo_ctx.attr.sha256,
+ stripPrefix=repo_ctx.attr.strip_prefix)
_apply_patch(repo_ctx, repo_ctx.attr.patch_file)
+
patched_http_archive = repository_rule(
- implementation = _patched_http_archive_impl,
- attrs = {
- "patch_file": attr.label(),
- "build_file": attr.label(),
- "repository": attr.string(),
- "urls": attr.string_list(default = []),
- "sha256": attr.string(default = ""),
- "strip_prefix": attr.string(default = ""),
+ implementation=_patched_http_archive_impl,
+ attrs={
+ "patch_file": attr.label(),
+ "build_file": attr.label(),
+ "repository": attr.string(),
+ "urls": attr.string_list(default=[]),
+ "sha256": attr.string(default=""),
+ "strip_prefix": attr.string(default=""),
})
+
# If TensorFlow is linked as a submodule.
# path_prefix and tf_repo_name are no longer used.
-def tf_workspace(path_prefix = "", tf_repo_name = ""):
+def tf_workspace(path_prefix="", tf_repo_name=""):
# We must check the bazel version before trying to parse any other BUILD
# files, in case the parsing of those build files depends on the bazel
# version we require here.
check_version("0.4.5")
- cuda_configure(name = "local_config_cuda")
- sycl_configure(name = "local_config_sycl")
+ cuda_configure(name="local_config_cuda")
+ sycl_configure(name="local_config_sycl")
if path_prefix:
- print("path_prefix was specified to tf_workspace but is no longer used and will be removed in the future.")
+ print(
+ "path_prefix was specified to tf_workspace but is no longer used and " +
+ "will be removed in the future."
+ )
if tf_repo_name:
- print("tf_repo_name was specified to tf_workspace but is no longer used and will be removed in the future.")
+ print(
+ "tf_repo_name was specified to tf_workspace but is no longer used " +
+ "and will be removed in the future."
+ )
native.new_http_archive(
- name = "eigen_archive",
- urls = [
+ name="eigen_archive",
+ urls=[
"http://bazel-mirror.storage.googleapis.com/bitbucket.org/eigen/eigen/get/deff8b280204.tar.gz",
"https://bitbucket.org/eigen/eigen/get/deff8b280204.tar.gz",
],
- sha256 = "a39834683eb5bdb9a7434f0ab3621d2cbc3b07e8002db6de101e45ec536723eb",
- strip_prefix = "eigen-eigen-deff8b280204",
- build_file = str(Label("//third_party:eigen.BUILD")),
- )
+ sha256="a39834683eb5bdb9a7434f0ab3621d2cbc3b07e8002db6de101e45ec536723eb",
+ strip_prefix="eigen-eigen-deff8b280204",
+ build_file=str(Label("//third_party:eigen.BUILD")),)
native.new_http_archive(
- name = "libxsmm_archive",
- urls = [
- "http://bazel-mirror.storage.googleapis.com/github.com/hfp/libxsmm/archive/1.7.1.tar.gz",
- "https://github.com/hfp/libxsmm/archive/1.7.1.tar.gz",
+ name="libxsmm_archive",
+ urls=[
+ "http://bazel-mirror.storage.googleapis.com/github.com/hfp/libxsmm/archive/1.8.tar.gz",
+ "https://github.com/hfp/libxsmm/archive/1.8.tar.gz",
],
- sha256 = "9d3f63ce3eed62f04e4036de6f2be2ce0ff07781ca571af6e0bf85b077edf17a",
- strip_prefix = "libxsmm-1.7.1",
- build_file = str(Label("//third_party:libxsmm.BUILD")),
- )
+ sha256="0330201afb5525d0950ec861fec9dd75eb40a03845ebe03d2c635cf8bfc14fea",
+ strip_prefix="libxsmm-1.8",
+ build_file=str(Label("//third_party:libxsmm.BUILD")),)
native.bind(
- name = "xsmm_avx",
- actual = "@libxsmm_archive//third_party:xsmm_avx",
- )
+ name="xsmm_avx",
+ actual="@libxsmm_archive//third_party:xsmm_avx",)
+
+ native.new_http_archive(
+ name="ortools_archive",
+ urls=[
+ "http://bazel-mirror.storage.googleapis.com/github.com/google/or-tools/archive/253f7955c6a1fd805408fba2e42ac6d45b312d15.tar.gz",
+ "https://github.com/google/or-tools/archive/253f7955c6a1fd805408fba2e42ac6d45b312d15.tar.gz",
+ ],
+ sha256="932075525642b04ac6f1b50589f1df5cd72ec2f448b721fd32234cf183f0e755",
+ strip_prefix="or-tools-253f7955c6a1fd805408fba2e42ac6d45b312d15/src",
+ build_file=str(Label("//third_party:ortools.BUILD")),)
native.http_archive(
- name = "com_googlesource_code_re2",
- urls = [
+ name="com_googlesource_code_re2",
+ urls=[
"http://bazel-mirror.storage.googleapis.com/github.com/google/re2/archive/b94b7cd42e9f02673cd748c1ac1d16db4052514c.tar.gz",
"https://github.com/google/re2/archive/b94b7cd42e9f02673cd748c1ac1d16db4052514c.tar.gz",
],
- sha256 = "bd63550101e056427c9e7ff12a408c1c8b74e9803f393ca916b2926fc2c4906f",
- strip_prefix = "re2-b94b7cd42e9f02673cd748c1ac1d16db4052514c",
- )
+ sha256="bd63550101e056427c9e7ff12a408c1c8b74e9803f393ca916b2926fc2c4906f",
+ strip_prefix="re2-b94b7cd42e9f02673cd748c1ac1d16db4052514c",)
native.http_archive(
- name = "gemmlowp",
- urls = [
+ name="gemmlowp",
+ urls=[
"http://bazel-mirror.storage.googleapis.com/github.com/google/gemmlowp/archive/a6f29d8ac48d63293f845f2253eccbf86bc28321.tar.gz",
"https://github.com/google/gemmlowp/archive/a6f29d8ac48d63293f845f2253eccbf86bc28321.tar.gz",
],
- sha256 = "75d40ea8e68b0d1644f052fffe8f14a410b2a73d40ccb859a95c0578d194ec26",
- strip_prefix = "gemmlowp-a6f29d8ac48d63293f845f2253eccbf86bc28321",
- )
+ sha256="75d40ea8e68b0d1644f052fffe8f14a410b2a73d40ccb859a95c0578d194ec26",
+ strip_prefix="gemmlowp-a6f29d8ac48d63293f845f2253eccbf86bc28321",)
native.new_http_archive(
- name = "farmhash_archive",
- urls = [
+ name="farmhash_archive",
+ urls=[
"http://bazel-mirror.storage.googleapis.com/github.com/google/farmhash/archive/92e897b282426729f4724d91a637596c7e2fe28f.zip",
"https://github.com/google/farmhash/archive/92e897b282426729f4724d91a637596c7e2fe28f.zip",
],
- sha256 = "4c626d1f306bda2c6804ab955892f803f5245f4dcaecb4979dc08b091256da54",
- strip_prefix = "farmhash-92e897b282426729f4724d91a637596c7e2fe28f",
- build_file = str(Label("//third_party:farmhash.BUILD")),
- )
+ sha256="4c626d1f306bda2c6804ab955892f803f5245f4dcaecb4979dc08b091256da54",
+ strip_prefix="farmhash-92e897b282426729f4724d91a637596c7e2fe28f",
+ build_file=str(Label("//third_party:farmhash.BUILD")),)
native.bind(
- name = "farmhash",
- actual = "@farmhash//:farmhash",
- )
+ name="farmhash",
+ actual="@farmhash//:farmhash",)
native.new_http_archive(
- name = "highwayhash",
- urls = [
+ name="highwayhash",
+ urls=[
"http://bazel-mirror.storage.googleapis.com/github.com/google/highwayhash/archive/dfcb97ca4fe9277bf9dc1802dd979b071896453b.tar.gz",
"https://github.com/google/highwayhash/archive/dfcb97ca4fe9277bf9dc1802dd979b071896453b.tar.gz",
],
- sha256 = "0f30a15b1566d93f146c8d149878a06e91d9bb7ec2cfd76906df62a82be4aac9",
- strip_prefix = "highwayhash-dfcb97ca4fe9277bf9dc1802dd979b071896453b",
- build_file = str(Label("//third_party:highwayhash.BUILD")),
- )
+ sha256="0f30a15b1566d93f146c8d149878a06e91d9bb7ec2cfd76906df62a82be4aac9",
+ strip_prefix="highwayhash-dfcb97ca4fe9277bf9dc1802dd979b071896453b",
+ build_file=str(Label("//third_party:highwayhash.BUILD")),)
native.new_http_archive(
- name = "nasm",
- urls = [
+ name="nasm",
+ urls=[
"http://bazel-mirror.storage.googleapis.com/www.nasm.us/pub/nasm/releasebuilds/2.12.02/nasm-2.12.02.tar.bz2",
"http://pkgs.fedoraproject.org/repo/pkgs/nasm/nasm-2.12.02.tar.bz2/d15843c3fb7db39af80571ee27ec6fad/nasm-2.12.02.tar.bz2",
],
- sha256 = "00b0891c678c065446ca59bcee64719d0096d54d6886e6e472aeee2e170ae324",
- strip_prefix = "nasm-2.12.02",
- build_file = str(Label("//third_party:nasm.BUILD")),
- )
+ sha256="00b0891c678c065446ca59bcee64719d0096d54d6886e6e472aeee2e170ae324",
+ strip_prefix="nasm-2.12.02",
+ build_file=str(Label("//third_party:nasm.BUILD")),)
temp_workaround_http_archive(
- name = "jpeg",
- urls = [
+ name="jpeg",
+ urls=[
"http://bazel-mirror.storage.googleapis.com/github.com/libjpeg-turbo/libjpeg-turbo/archive/1.5.1.tar.gz",
"https://github.com/libjpeg-turbo/libjpeg-turbo/archive/1.5.1.tar.gz",
],
- sha256 = "c15a9607892113946379ccea3ca8b85018301b200754f209453ab21674268e77",
- strip_prefix = "libjpeg-turbo-1.5.1",
- build_file = str(Label("//third_party/jpeg:jpeg.BUILD")),
- repository = tf_repo_name,
- )
+ sha256="c15a9607892113946379ccea3ca8b85018301b200754f209453ab21674268e77",
+ strip_prefix="libjpeg-turbo-1.5.1",
+ build_file=str(Label("//third_party/jpeg:jpeg.BUILD")),
+ repository=tf_repo_name,)
native.new_http_archive(
- name = "png_archive",
- urls = [
+ name="png_archive",
+ urls=[
"http://bazel-mirror.storage.googleapis.com/github.com/glennrp/libpng/archive/v1.2.53.zip",
"https://github.com/glennrp/libpng/archive/v1.2.53.zip",
],
- sha256 = "c35bcc6387495ee6e757507a68ba036d38ad05b415c2553b3debe2a57647a692",
- strip_prefix = "libpng-1.2.53",
- build_file = str(Label("//third_party:png.BUILD")),
- )
+ sha256="c35bcc6387495ee6e757507a68ba036d38ad05b415c2553b3debe2a57647a692",
+ strip_prefix="libpng-1.2.53",
+ build_file=str(Label("//third_party:png.BUILD")),)
native.new_http_archive(
- name = "gif_archive",
- urls = [
+ name="gif_archive",
+ urls=[
"http://bazel-mirror.storage.googleapis.com/ufpr.dl.sourceforge.net/project/giflib/giflib-5.1.4.tar.gz",
"http://ufpr.dl.sourceforge.net/project/giflib/giflib-5.1.4.tar.gz",
"http://pilotfiber.dl.sourceforge.net/project/giflib/giflib-5.1.4.tar.gz",
],
- sha256 = "34a7377ba834397db019e8eb122e551a49c98f49df75ec3fcc92b9a794a4f6d1",
- strip_prefix = "giflib-5.1.4",
- build_file = str(Label("//third_party:gif.BUILD")),
- )
+ sha256="34a7377ba834397db019e8eb122e551a49c98f49df75ec3fcc92b9a794a4f6d1",
+ strip_prefix="giflib-5.1.4",
+ build_file=str(Label("//third_party:gif.BUILD")),)
native.new_http_archive(
- name = "six_archive",
- urls = [
+ name="six_archive",
+ urls=[
"http://bazel-mirror.storage.googleapis.com/pypi.python.org/packages/source/s/six/six-1.10.0.tar.gz",
"http://pypi.python.org/packages/source/s/six/six-1.10.0.tar.gz",
],
- sha256 = "105f8d68616f8248e24bf0e9372ef04d3cc10104f1980f54d57b2ce73a5ad56a",
- strip_prefix = "six-1.10.0",
- build_file = str(Label("//third_party:six.BUILD")),
- )
+ sha256="105f8d68616f8248e24bf0e9372ef04d3cc10104f1980f54d57b2ce73a5ad56a",
+ strip_prefix="six-1.10.0",
+ build_file=str(Label("//third_party:six.BUILD")),)
native.new_http_archive(
- name = "org_pythonhosted_markdown",
- urls = [
+ name="org_pythonhosted_markdown",
+ urls=[
"http://bazel-mirror.storage.googleapis.com/pypi.python.org/packages/1d/25/3f6d2cb31ec42ca5bd3bfbea99b63892b735d76e26f20dd2dcc34ffe4f0d/Markdown-2.6.8.tar.gz",
"https://pypi.python.org/packages/1d/25/3f6d2cb31ec42ca5bd3bfbea99b63892b735d76e26f20dd2dcc34ffe4f0d/Markdown-2.6.8.tar.gz",
],
- strip_prefix = "Markdown-2.6.8",
- sha256 = "0ac8a81e658167da95d063a9279c9c1b2699f37c7c4153256a458b3a43860e33",
- build_file = str(Label("//third_party:markdown.BUILD")),
- )
+ strip_prefix="Markdown-2.6.8",
+ sha256="0ac8a81e658167da95d063a9279c9c1b2699f37c7c4153256a458b3a43860e33",
+ build_file=str(Label("//third_party:markdown.BUILD")),)
native.new_http_archive(
- name = "org_html5lib",
- urls = [
+ name="org_html5lib",
+ urls=[
"http://bazel-mirror.storage.googleapis.com/github.com/html5lib/html5lib-python/archive/1.0b8.tar.gz",
"https://github.com/html5lib/html5lib-python/archive/1.0b8.tar.gz",
],
- sha256 = "adb36c879264e8880b92589c4c4fe0814cd9d157b73328b14d728f48a6bab0a4",
- strip_prefix = "html5lib-python-1.0b8",
- build_file = str(Label("//third_party:html5lib.BUILD")),
- )
+ sha256="adb36c879264e8880b92589c4c4fe0814cd9d157b73328b14d728f48a6bab0a4",
+ strip_prefix="html5lib-python-1.0b8",
+ build_file=str(Label("//third_party:html5lib.BUILD")),)
native.new_http_archive(
- name = "org_mozilla_bleach",
- urls = [
+ name="org_mozilla_bleach",
+ urls=[
"http://bazel-mirror.storage.googleapis.com/github.com/mozilla/bleach/archive/v1.5.tar.gz",
"https://github.com/mozilla/bleach/archive/v1.5.tar.gz",
],
- strip_prefix = "bleach-1.5",
- sha256 = "0d68713d02ba4148c417ab1637dd819333d96929a34401d0233947bec0881ad8",
- build_file = str(Label("//third_party:bleach.BUILD")),
- )
+ strip_prefix="bleach-1.5",
+ sha256="0d68713d02ba4148c417ab1637dd819333d96929a34401d0233947bec0881ad8",
+ build_file=str(Label("//third_party:bleach.BUILD")),)
native.new_http_archive(
- name = "org_pocoo_werkzeug",
- urls = [
+ name="org_pocoo_werkzeug",
+ urls=[
"http://bazel-mirror.storage.googleapis.com/pypi.python.org/packages/b7/7f/44d3cfe5a12ba002b253f6985a4477edfa66da53787a2a838a40f6415263/Werkzeug-0.11.10.tar.gz",
"https://pypi.python.org/packages/b7/7f/44d3cfe5a12ba002b253f6985a4477edfa66da53787a2a838a40f6415263/Werkzeug-0.11.10.tar.gz",
],
- strip_prefix = "Werkzeug-0.11.10",
- sha256 = "cc64dafbacc716cdd42503cf6c44cb5a35576443d82f29f6829e5c49264aeeee",
- build_file = str(Label("//third_party:werkzeug.BUILD")),
- )
+ strip_prefix="Werkzeug-0.11.10",
+ sha256="cc64dafbacc716cdd42503cf6c44cb5a35576443d82f29f6829e5c49264aeeee",
+ build_file=str(Label("//third_party:werkzeug.BUILD")),)
native.bind(
- name = "six",
- actual = "@six_archive//:six",
- )
+ name="six",
+ actual="@six_archive//:six",)
patched_http_archive(
- name = "protobuf",
- urls = [
+ name="protobuf",
+ urls=[
"http://bazel-mirror.storage.googleapis.com/github.com/google/protobuf/archive/2b7430d96aeff2bb624c8d52182ff5e4b9f7f18a.tar.gz",
"https://github.com/google/protobuf/archive/2b7430d96aeff2bb624c8d52182ff5e4b9f7f18a.tar.gz",
],
- sha256 = "e5d3d4e227a0f7afb8745df049bbd4d55474b158ca5aaa2a0e31099af24be1d0",
- strip_prefix = "protobuf-2b7430d96aeff2bb624c8d52182ff5e4b9f7f18a",
+ sha256="e5d3d4e227a0f7afb8745df049bbd4d55474b158ca5aaa2a0e31099af24be1d0",
+ strip_prefix="protobuf-2b7430d96aeff2bb624c8d52182ff5e4b9f7f18a",
    # TODO: remove patching when tensorflow stops linking the same protos into
    # multiple shared libraries loaded at runtime by python.
# This patch fixes a runtime crash when tensorflow is compiled
# with clang -O2 on Linux (see https://github.com/tensorflow/tensorflow/issues/8394)
- patch_file = str(Label("//third_party/protobuf:add_noinlines.patch")),
- )
+ patch_file=str(Label("//third_party/protobuf:add_noinlines.patch")),)
+
+ # We need to import the protobuf library under the names com_google_protobuf
+ # and com_google_protobuf_cc to enable proto_library support in bazel.
+ # Unfortunately there is no way to alias http_archives at the moment.
+ native.http_archive(
+ name="com_google_protobuf",
+ urls=[
+ "http://bazel-mirror.storage.googleapis.com/github.com/google/protobuf/archive/2b7430d96aeff2bb624c8d52182ff5e4b9f7f18a.tar.gz",
+ "https://github.com/google/protobuf/archive/2b7430d96aeff2bb624c8d52182ff5e4b9f7f18a.tar.gz",
+ ],
+ sha256="e5d3d4e227a0f7afb8745df049bbd4d55474b158ca5aaa2a0e31099af24be1d0",
+ strip_prefix="protobuf-2b7430d96aeff2bb624c8d52182ff5e4b9f7f18a",)
+
+ native.http_archive(
+ name="com_google_protobuf_cc",
+ urls=[
+ "http://bazel-mirror.storage.googleapis.com/github.com/google/protobuf/archive/2b7430d96aeff2bb624c8d52182ff5e4b9f7f18a.tar.gz",
+ "https://github.com/google/protobuf/archive/2b7430d96aeff2bb624c8d52182ff5e4b9f7f18a.tar.gz",
+ ],
+ sha256="e5d3d4e227a0f7afb8745df049bbd4d55474b158ca5aaa2a0e31099af24be1d0",
+ strip_prefix="protobuf-2b7430d96aeff2bb624c8d52182ff5e4b9f7f18a",)
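To sketch what these aliases enable: with @com_google_protobuf and @com_google_protobuf_cc present, Bazel's built-in proto rules can resolve their implicit toolchain dependencies, so a BUILD file can write (target names are illustrative):

    # Sketch: proto_library/cc_proto_library implicitly resolve against the
    # @com_google_protobuf* repositories declared above.
    proto_library(
        name="example_proto",
        srcs=["example.proto"],
    )
    cc_proto_library(
        name="example_cc_proto",
        deps=[":example_proto"],
    )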
native.new_http_archive(
- name = "gmock_archive",
- urls = [
+ name="gmock_archive",
+ urls=[
"http://bazel-mirror.storage.googleapis.com/github.com/google/googletest/archive/release-1.8.0.zip",
"https://github.com/google/googletest/archive/release-1.8.0.zip",
],
- sha256 = "f3ed3b58511efd272eb074a3a6d6fb79d7c2e6a0e374323d1e6bcbcc1ef141bf",
- strip_prefix = "googletest-release-1.8.0",
- build_file = str(Label("//third_party:gmock.BUILD")),
- )
+ sha256="f3ed3b58511efd272eb074a3a6d6fb79d7c2e6a0e374323d1e6bcbcc1ef141bf",
+ strip_prefix="googletest-release-1.8.0",
+ build_file=str(Label("//third_party:gmock.BUILD")),)
native.bind(
- name = "gtest",
- actual = "@gmock_archive//:gtest",
- )
+ name="gtest",
+ actual="@gmock_archive//:gtest",)
native.bind(
- name = "gtest_main",
- actual = "@gmock_archive//:gtest_main",
- )
+ name="gtest_main",
+ actual="@gmock_archive//:gtest_main",)
+
+ native.git_repository(
+ name="com_github_gflags_gflags",
+ commit="f8a0efe03aa69b3336d8e228b37d4ccb17324b88",
+ remote="https://github.com/gflags/gflags.git",)
native.bind(
- name = "python_headers",
- actual = str(Label("//util/python:python_headers")),
- )
+ name="python_headers",
+ actual=str(Label("//util/python:python_headers")),)
native.new_http_archive(
- name = "pcre",
- sha256 = "ccdf7e788769838f8285b3ee672ed573358202305ee361cfec7a4a4fb005bbc7",
- urls = [
+ name="pcre",
+ sha256="ccdf7e788769838f8285b3ee672ed573358202305ee361cfec7a4a4fb005bbc7",
+ urls=[
"http://bazel-mirror.storage.googleapis.com/ftp.exim.org/pub/pcre/pcre-8.39.tar.gz",
"http://ftp.exim.org/pub/pcre/pcre-8.39.tar.gz",
],
- strip_prefix = "pcre-8.39",
- build_file = str(Label("//third_party:pcre.BUILD")),
- )
+ strip_prefix="pcre-8.39",
+ build_file=str(Label("//third_party:pcre.BUILD")),)
native.new_http_archive(
- name = "swig",
- sha256 = "58a475dbbd4a4d7075e5fe86d4e54c9edde39847cdb96a3053d87cb64a23a453",
- urls = [
+ name="swig",
+ sha256="58a475dbbd4a4d7075e5fe86d4e54c9edde39847cdb96a3053d87cb64a23a453",
+ urls=[
"http://bazel-mirror.storage.googleapis.com/ufpr.dl.sourceforge.net/project/swig/swig/swig-3.0.8/swig-3.0.8.tar.gz",
"http://ufpr.dl.sourceforge.net/project/swig/swig/swig-3.0.8/swig-3.0.8.tar.gz",
"http://pilotfiber.dl.sourceforge.net/project/swig/swig/swig-3.0.8/swig-3.0.8.tar.gz",
],
- strip_prefix = "swig-3.0.8",
- build_file = str(Label("//third_party:swig.BUILD")),
- )
+ strip_prefix="swig-3.0.8",
+ build_file=str(Label("//third_party:swig.BUILD")),)
temp_workaround_http_archive(
- name = "curl",
- sha256 = "ff3e80c1ca6a068428726cd7dd19037a47cc538ce58ef61c59587191039b2ca6",
- urls = [
+ name="curl",
+ sha256="ff3e80c1ca6a068428726cd7dd19037a47cc538ce58ef61c59587191039b2ca6",
+ urls=[
"http://bazel-mirror.storage.googleapis.com/curl.haxx.se/download/curl-7.49.1.tar.gz",
"https://curl.haxx.se/download/curl-7.49.1.tar.gz",
],
- strip_prefix = "curl-7.49.1",
- build_file = str(Label("//third_party:curl.BUILD")),
- repository = tf_repo_name
- )
+ strip_prefix="curl-7.49.1",
+ build_file=str(Label("//third_party:curl.BUILD")),
+ repository=tf_repo_name)
  # grpc expects //external:protobuf_clib and //external:protobuf_compiler
  # to point to protobuf's compiler library.
native.bind(
- name = "protobuf_clib",
- actual = "@protobuf//:protoc_lib",
- )
+ name="protobuf_clib",
+ actual="@protobuf//:protoc_lib",)
native.bind(
- name = "protobuf_compiler",
- actual = "@protobuf//:protoc_lib",
- )
+ name="protobuf_compiler",
+ actual="@protobuf//:protoc_lib",)
native.new_http_archive(
- name = "grpc",
- urls = [
+ name="grpc",
+ urls=[
"http://bazel-mirror.storage.googleapis.com/github.com/grpc/grpc/archive/d7ff4ff40071d2b486a052183e3e9f9382afb745.tar.gz",
"https://github.com/grpc/grpc/archive/d7ff4ff40071d2b486a052183e3e9f9382afb745.tar.gz",
],
- sha256 = "a15f352436ab92c521b1ac11e729e155ace38d0856380cf25048c5d1d9ba8e31",
- strip_prefix = "grpc-d7ff4ff40071d2b486a052183e3e9f9382afb745",
- build_file = str(Label("//third_party:grpc.BUILD")),
- )
+ sha256="a15f352436ab92c521b1ac11e729e155ace38d0856380cf25048c5d1d9ba8e31",
+ strip_prefix="grpc-d7ff4ff40071d2b486a052183e3e9f9382afb745",
+ build_file=str(Label("//third_party:grpc.BUILD")),)
# protobuf expects //external:grpc_cpp_plugin to point to grpc's
# C++ plugin code generator.
native.bind(
- name = "grpc_cpp_plugin",
- actual = "@grpc//:grpc_cpp_plugin",
- )
+ name="grpc_cpp_plugin",
+ actual="@grpc//:grpc_cpp_plugin",)
native.bind(
- name = "grpc_lib",
- actual = "@grpc//:grpc++_unsecure",
- )
+ name="grpc_lib",
+ actual="@grpc//:grpc++_unsecure",)
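As a reminder of how these bind() calls are consumed: each one creates an alias under //external, so library code depends on the abstract name rather than a concrete repository label. A hypothetical consumer:

    # Sketch: depend on gRPC through the //external alias set up above.
    cc_library(
        name="uses_grpc",  # illustrative target
        srcs=["client.cc"],
        deps=["//external:grpc_lib"],
    )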
native.new_http_archive(
- name = "linenoise",
- sha256 = "7f51f45887a3d31b4ce4fa5965210a5e64637ceac12720cfce7954d6a2e812f7",
- urls = [
+ name="linenoise",
+ sha256="7f51f45887a3d31b4ce4fa5965210a5e64637ceac12720cfce7954d6a2e812f7",
+ urls=[
"http://bazel-mirror.storage.googleapis.com/github.com/antirez/linenoise/archive/c894b9e59f02203dbe4e2be657572cf88c4230c3.tar.gz",
"https://github.com/antirez/linenoise/archive/c894b9e59f02203dbe4e2be657572cf88c4230c3.tar.gz",
],
- strip_prefix = "linenoise-c894b9e59f02203dbe4e2be657572cf88c4230c3",
- build_file = str(Label("//third_party:linenoise.BUILD")),
- )
+ strip_prefix="linenoise-c894b9e59f02203dbe4e2be657572cf88c4230c3",
+ build_file=str(Label("//third_party:linenoise.BUILD")),)
# TODO(phawkins): currently, this rule uses an unofficial LLVM mirror.
# Switch to an official source of snapshots if/when possible.
temp_workaround_http_archive(
- name = "llvm",
- urls = [
+ name="llvm",
+ urls=[
"http://bazel-mirror.storage.googleapis.com/github.com/llvm-mirror/llvm/archive/5d2b26453d4bca5a13b69b0130e4369d1fcd393d.tar.gz",
"https://github.com/llvm-mirror/llvm/archive/5d2b26453d4bca5a13b69b0130e4369d1fcd393d.tar.gz",
],
- sha256 = "3cecf39bf4b3854629d610bb321bb57e0e46bda9110bd51c3bae5a4171c82bab",
- strip_prefix = "llvm-5d2b26453d4bca5a13b69b0130e4369d1fcd393d",
- build_file = str(Label("//third_party/llvm:llvm.BUILD")),
- repository = tf_repo_name,
- )
+ sha256="3cecf39bf4b3854629d610bb321bb57e0e46bda9110bd51c3bae5a4171c82bab",
+ strip_prefix="llvm-5d2b26453d4bca5a13b69b0130e4369d1fcd393d",
+ build_file=str(Label("//third_party/llvm:llvm.BUILD")),
+ repository=tf_repo_name,)
native.new_http_archive(
- name = "jsoncpp_git",
- urls = [
+ name="jsoncpp_git",
+ urls=[
"http://bazel-mirror.storage.googleapis.com/github.com/open-source-parsers/jsoncpp/archive/11086dd6a7eba04289944367ca82cea71299ed70.tar.gz",
"https://github.com/open-source-parsers/jsoncpp/archive/11086dd6a7eba04289944367ca82cea71299ed70.tar.gz",
],
- sha256 = "07d34db40593d257324ec5fb9debc4dc33f29f8fb44e33a2eeb35503e61d0fe2",
- strip_prefix = "jsoncpp-11086dd6a7eba04289944367ca82cea71299ed70",
- build_file = str(Label("//third_party:jsoncpp.BUILD")),
- )
+ sha256="07d34db40593d257324ec5fb9debc4dc33f29f8fb44e33a2eeb35503e61d0fe2",
+ strip_prefix="jsoncpp-11086dd6a7eba04289944367ca82cea71299ed70",
+ build_file=str(Label("//third_party:jsoncpp.BUILD")),)
native.bind(
- name = "jsoncpp",
- actual = "@jsoncpp_git//:jsoncpp",
- )
+ name="jsoncpp",
+ actual="@jsoncpp_git//:jsoncpp",)
native.http_archive(
- name = "boringssl",
- urls = [
+ name="boringssl",
+ urls=[
"http://bazel-mirror.storage.googleapis.com/github.com/google/boringssl/archive/bbcaa15b0647816b9a1a9b9e0d209cd6712f0105.tar.gz",
"https://github.com/google/boringssl/archive/bbcaa15b0647816b9a1a9b9e0d209cd6712f0105.tar.gz", # 2016-07-11
],
- sha256 = "025264d6e9a7ad371f2f66d17a28b6627de0c9592dc2eb54afd062f68f1f9aa3",
- strip_prefix = "boringssl-bbcaa15b0647816b9a1a9b9e0d209cd6712f0105",
- )
+ sha256="025264d6e9a7ad371f2f66d17a28b6627de0c9592dc2eb54afd062f68f1f9aa3",
+ strip_prefix="boringssl-bbcaa15b0647816b9a1a9b9e0d209cd6712f0105",)
native.new_http_archive(
- name = "nanopb_git",
- urls = [
+ name="nanopb_git",
+ urls=[
"http://bazel-mirror.storage.googleapis.com/github.com/nanopb/nanopb/archive/1251fa1065afc0d62f635e0f63fec8276e14e13c.tar.gz",
"https://github.com/nanopb/nanopb/archive/1251fa1065afc0d62f635e0f63fec8276e14e13c.tar.gz",
],
- sha256 = "ab1455c8edff855f4f55b68480991559e51c11e7dab060bbab7cffb12dd3af33",
- strip_prefix = "nanopb-1251fa1065afc0d62f635e0f63fec8276e14e13c",
- build_file = str(Label("//third_party:nanopb.BUILD")),
- )
+ sha256="ab1455c8edff855f4f55b68480991559e51c11e7dab060bbab7cffb12dd3af33",
+ strip_prefix="nanopb-1251fa1065afc0d62f635e0f63fec8276e14e13c",
+ build_file=str(Label("//third_party:nanopb.BUILD")),)
native.bind(
- name = "nanopb",
- actual = "@nanopb_git//:nanopb",
- )
+ name="nanopb",
+ actual="@nanopb_git//:nanopb",)
native.new_http_archive(
- name = "zlib_archive",
- urls = [
+ name="zlib_archive",
+ urls=[
"http://bazel-mirror.storage.googleapis.com/zlib.net/zlib-1.2.8.tar.gz",
"http://zlib.net/fossils/zlib-1.2.8.tar.gz",
],
- sha256 = "36658cb768a54c1d4dec43c3116c27ed893e88b02ecfcb44f2166f9c0b7f2a0d",
- strip_prefix = "zlib-1.2.8",
- build_file = str(Label("//third_party:zlib.BUILD")),
- )
+ sha256="36658cb768a54c1d4dec43c3116c27ed893e88b02ecfcb44f2166f9c0b7f2a0d",
+ strip_prefix="zlib-1.2.8",
+ build_file=str(Label("//third_party:zlib.BUILD")),)
native.bind(
- name = "zlib",
- actual = "@zlib_archive//:zlib",
- )
+ name="zlib",
+ actual="@zlib_archive//:zlib",)
temp_workaround_http_archive(
- name = "snappy",
- urls = [
+ name="snappy",
+ urls=[
"http://bazel-mirror.storage.googleapis.com/github.com/google/snappy/archive/1.1.4.zip",
"https://github.com/google/snappy/archive/1.1.4.zip",
],
- sha256 = "6c74d2b663170d68184da353cdd71b5b7d57bc8888ef1e99b4929b5d680dba54",
- strip_prefix = "snappy-1.1.4",
- build_file = str(Label("//third_party:snappy.BUILD")),
- repository = tf_repo_name,
- )
+ sha256="6c74d2b663170d68184da353cdd71b5b7d57bc8888ef1e99b4929b5d680dba54",
+ strip_prefix="snappy-1.1.4",
+ build_file=str(Label("//third_party:snappy.BUILD")),
+ repository=tf_repo_name,)
temp_workaround_http_archive(
- name = "nccl_archive",
- urls = [
+ name="nccl_archive",
+ urls=[
"http://bazel-mirror.storage.googleapis.com/github.com/nvidia/nccl/archive/024d1e267845f2ed06f3e2e42476d50f04a00ee6.tar.gz",
"https://github.com/nvidia/nccl/archive/024d1e267845f2ed06f3e2e42476d50f04a00ee6.tar.gz",
],
- sha256 = "6787f0eed88d52ee8e32956fa4947d92c139da469f1d8e311c307f27d641118e",
- strip_prefix = "nccl-024d1e267845f2ed06f3e2e42476d50f04a00ee6",
- build_file = str(Label("//third_party/nccl:nccl.BUILD")),
+ sha256="6787f0eed88d52ee8e32956fa4947d92c139da469f1d8e311c307f27d641118e",
+ strip_prefix="nccl-024d1e267845f2ed06f3e2e42476d50f04a00ee6",
+ build_file=str(Label("//third_party/nccl:nccl.BUILD")),
    # TODO: Remove patching after the fix is merged into nccl (see https://github.com/NVIDIA/nccl/pull/78)
- patch_file = str(Label("//third_party/nccl:fix_clang_compilation.patch")),
- repository = tf_repo_name,
- )
+ patch_file=str(Label("//third_party/nccl:fix_clang_compilation.patch")),
+ repository=tf_repo_name,)
java_import_external(
- name = "junit",
- jar_sha256 = "59721f0805e223d84b90677887d9ff567dc534d7c502ca903c0c2b17f05c116a",
- jar_urls = [
+ name="junit",
+ jar_sha256=
+ "59721f0805e223d84b90677887d9ff567dc534d7c502ca903c0c2b17f05c116a",
+ jar_urls=[
"http://bazel-mirror.storage.googleapis.com/repo1.maven.org/maven2/junit/junit/4.12/junit-4.12.jar",
"http://repo1.maven.org/maven2/junit/junit/4.12/junit-4.12.jar",
"http://maven.ibiblio.org/maven2/junit/junit/4.12/junit-4.12.jar",
],
- licenses = ["reciprocal"], # Common Public License Version 1.0
- testonly_ = True,
- deps = ["@org_hamcrest_core"],
- )
+ licenses=["reciprocal"], # Common Public License Version 1.0
+ testonly_=True,
+ deps=["@org_hamcrest_core"],)
java_import_external(
- name = "org_hamcrest_core",
- jar_sha256 = "66fdef91e9739348df7a096aa384a5685f4e875584cce89386a7a47251c4d8e9",
- jar_urls = [
+ name="org_hamcrest_core",
+ jar_sha256=
+ "66fdef91e9739348df7a096aa384a5685f4e875584cce89386a7a47251c4d8e9",
+ jar_urls=[
"http://bazel-mirror.storage.googleapis.com/repo1.maven.org/maven2/org/hamcrest/hamcrest-core/1.3/hamcrest-core-1.3.jar",
"http://repo1.maven.org/maven2/org/hamcrest/hamcrest-core/1.3/hamcrest-core-1.3.jar",
"http://maven.ibiblio.org/maven2/org/hamcrest/hamcrest-core/1.3/hamcrest-core-1.3.jar",
],
- licenses = ["notice"], # New BSD License
- testonly_ = True,
- )
+ licenses=["notice"], # New BSD License
+ testonly_=True,)
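Since both jars are declared testonly_, only test targets may depend on them; a hedged sketch of a typical consumer:

    # Sketch: a java_test wired against the junit/hamcrest jars fetched above.
    java_test(
        name="example_test",  # hypothetical
        srcs=["ExampleTest.java"],
        test_class="ExampleTest",
        deps=[
            "@junit",
            "@org_hamcrest_core",
        ],
    )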
temp_workaround_http_archive(
- name = "jemalloc",
- urls = [
+ name="jemalloc",
+ urls=[
"http://bazel-mirror.storage.googleapis.com/github.com/jemalloc/jemalloc/archive/4.4.0.tar.gz",
"https://github.com/jemalloc/jemalloc/archive/4.4.0.tar.gz",
],
- sha256 = "3c8f25c02e806c3ce0ab5fb7da1817f89fc9732709024e2a81b6b82f7cc792a8",
- strip_prefix = "jemalloc-4.4.0",
- build_file = str(Label("//third_party:jemalloc.BUILD")),
- repository = tf_repo_name,
- )
+ sha256="3c8f25c02e806c3ce0ab5fb7da1817f89fc9732709024e2a81b6b82f7cc792a8",
+ strip_prefix="jemalloc-4.4.0",
+ build_file=str(Label("//third_party:jemalloc.BUILD")),
+ repository=tf_repo_name,)
##############################################################################
# TensorBoard Build Tools
filegroup_external(
- name = "org_nodejs",
+ name="org_nodejs",
# MIT with portions licensed:
# - MIT
# - Old MIT
@@ -563,14 +574,14 @@ def tf_workspace(path_prefix = "", tf_repo_name = ""):
# - Unicode
# - zlib
# - Artistic 2.0
- licenses = ["notice"],
- sha256_urls_extract_macos = {
+ licenses=["notice"],
+ sha256_urls_extract_macos={
"47109a00cac344d80296c195451bb5eee7c21727fcef1594384ddfe1f852957a": [
"http://bazel-mirror.storage.googleapis.com/nodejs.org/dist/v4.3.2/node-v4.3.2-darwin-x64.tar.xz",
"http://nodejs.org/dist/v4.3.2/node-v4.3.2-darwin-x64.tar.xz",
],
},
- sha256_urls_windows = {
+ sha256_urls_windows={
"606c44c42d17866c017c50c0afadad411d9492ac4281d2431b937f881911614e": [
"http://bazel-mirror.storage.googleapis.com/nodejs.org/dist/v4.3.2/win-x64/node.exe",
"http://nodejs.org/dist/v4.3.2/win-x64/node.exe",
@@ -580,26 +591,25 @@ def tf_workspace(path_prefix = "", tf_repo_name = ""):
"http://nodejs.org/dist/v4.3.2/win-x64/node.lib",
],
},
- sha256_urls_extract = {
+ sha256_urls_extract={
"4350d0431b49697517c6cca5d66adf5f74eb9101c52f52ae959fa94225822d44": [
"http://bazel-mirror.storage.googleapis.com/nodejs.org/dist/v4.3.2/node-v4.3.2-linux-x64.tar.xz",
"http://nodejs.org/dist/v4.3.2/node-v4.3.2-linux-x64.tar.xz",
],
},
- strip_prefix = {
+ strip_prefix={
"node-v4.3.2-darwin-x64.tar.xz": "node-v4.3.2-darwin-x64",
"node-v4.3.2-linux-x64.tar.xz": "node-v4.3.2-linux-x64",
},
- executable = [
+ executable=[
"node",
"node.exe",
- ],
- )
+ ],)
filegroup_external(
- name = "com_microsoft_typescript",
- licenses = ["notice"], # Apache 2.0
- sha256_urls = {
+ name="com_microsoft_typescript",
+ licenses=["notice"], # Apache 2.0
+ sha256_urls={
"e3d9e320a2cae99be4aaa37953961a48323cdf16ba9aa2557a44d69571cd9b8d": [
"http://bazel-mirror.storage.googleapis.com/raw.githubusercontent.com/Microsoft/TypeScript/v2.1.6/lib/tsc.js",
"https://raw.githubusercontent.com/Microsoft/TypeScript/v2.1.6/lib/tsc.js",
@@ -609,7 +619,7 @@ def tf_workspace(path_prefix = "", tf_repo_name = ""):
"https://raw.githubusercontent.com/Microsoft/TypeScript/v2.1.6/lib/lib.es6.d.ts",
],
},
- extra_build_file_content = "\n".join([
+ extra_build_file_content="\n".join([
"sh_binary(",
" name = \"tsc\",",
" srcs = [\"tsc.sh\"],",
@@ -632,40 +642,37 @@ def tf_workspace(path_prefix = "", tf_repo_name = ""):
" \"EOF\",",
" executable = True,",
")",
- ]),
- )
+ ]),)
##############################################################################
# TensorBoard JavaScript Production Dependencies
filegroup_external(
- name = "com_lodash",
- licenses = ["notice"], # MIT
- sha256_urls = {
+ name="com_lodash",
+ licenses=["notice"], # MIT
+ sha256_urls={
"7c7b391810bc08cf815683431857c51b5ee190062ae4f557e1e4689d6dd910ea": [
"http://bazel-mirror.storage.googleapis.com/raw.githubusercontent.com/lodash/lodash/3.8.0/lodash.js",
"https://raw.githubusercontent.com/lodash/lodash/3.8.0/lodash.js",
],
- },
- )
+ },)
filegroup_external(
- name = "com_numericjs",
+ name="com_numericjs",
# no @license header
- licenses = ["notice"], # MIT
- sha256_urls = {
+ licenses=["notice"], # MIT
+ sha256_urls={
"dfaca3b8485bee735788cc6eebca82ea25719adc1fb8911c7799c6bd5a95df3b": [
"http://bazel-mirror.storage.googleapis.com/raw.githubusercontent.com/sloisel/numeric/v1.2.6/src/numeric.js",
"https://raw.githubusercontent.com/sloisel/numeric/v1.2.6/src/numeric.js",
],
- },
- )
+ },)
filegroup_external(
- name = "com_palantir_plottable",
+ name="com_palantir_plottable",
# no @license header
- licenses = ["notice"], # MIT
- sha256_urls = {
+ licenses=["notice"], # MIT
+ sha256_urls={
"77510d7538dbd3b59f1c8a06f68131b38562e3be546364747618d5112723e818": [
"http://bazel-mirror.storage.googleapis.com/raw.githubusercontent.com/palantir/plottable/v1.16.1/plottable.css",
"https://raw.githubusercontent.com/palantir/plottable/v1.16.1/plottable.css",
@@ -678,61 +685,56 @@ def tf_workspace(path_prefix = "", tf_repo_name = ""):
"http://bazel-mirror.storage.googleapis.com/raw.githubusercontent.com/palantir/plottable/v1.16.1/plottable.js",
"https://raw.githubusercontent.com/palantir/plottable/v1.16.1/plottable.js",
],
- },
- )
+ },)
filegroup_external(
- name = "io_github_cpettitt_dagre",
+ name="io_github_cpettitt_dagre",
# no @license header
- licenses = ["notice"], # MIT
- sha256_urls = {
+ licenses=["notice"], # MIT
+ sha256_urls={
"7323829ddd77924a69e2b1235ded3eac30acd990da0f037e0fbd3c8e9035b50d": [
"http://bazel-mirror.storage.googleapis.com/raw.githubusercontent.com/cpettitt/dagre/v0.7.4/dist/dagre.core.js",
"https://raw.githubusercontent.com/cpettitt/dagre/v0.7.4/dist/dagre.core.js",
],
- },
- )
+ },)
filegroup_external(
- name = "io_github_cpettitt_graphlib",
+ name="io_github_cpettitt_graphlib",
# no @license header
- licenses = ["notice"], # MIT
- sha256_urls = {
+ licenses=["notice"], # MIT
+ sha256_urls={
"772045d412b1513b549be991c2e1846c38019429d43974efcae943fbe83489bf": [
"http://bazel-mirror.storage.googleapis.com/raw.githubusercontent.com/cpettitt/graphlib/v1.0.7/dist/graphlib.core.js",
"https://raw.githubusercontent.com/cpettitt/graphlib/v1.0.7/dist/graphlib.core.js",
],
- },
- )
+ },)
filegroup_external(
- name = "io_github_waylonflinn_weblas",
+ name="io_github_waylonflinn_weblas",
# no @license header
- licenses = ["notice"], # MIT
- sha256_urls = {
+ licenses=["notice"], # MIT
+ sha256_urls={
"f138fce57f673ca8a633f4aee5ae5b6fcb6ad0de59069a42a74e996fd04d8fcc": [
"http://bazel-mirror.storage.googleapis.com/raw.githubusercontent.com/waylonflinn/weblas/v0.9.0/dist/weblas.js",
"https://raw.githubusercontent.com/waylonflinn/weblas/v0.9.0/dist/weblas.js",
],
- },
- )
+ },)
filegroup_external(
- name = "org_d3js",
+ name="org_d3js",
# no @license header
- licenses = ["notice"], # BSD-3-Clause
- sha256_urls = {
+ licenses=["notice"], # BSD-3-Clause
+ sha256_urls={
"bc1e38838f5c5c8e040132d41efee6bfddbef728210bd566479dc1694af1d3f5": [
"http://bazel-mirror.storage.googleapis.com/raw.githubusercontent.com/d3/d3/v3.5.15/d3.js",
"https://raw.githubusercontent.com/d3/d3/v3.5.15/d3.js",
],
- },
- )
+ },)
filegroup_external(
- name = "org_definitelytyped",
- licenses = ["notice"], # MIT
- sha256_urls = {
+ name="org_definitelytyped",
+ licenses=["notice"], # MIT
+ sha256_urls={
"b7da645f6e5555feb7aeede73775da0023ce2257df9c8e76c9159266035a9c0d": [
"http://bazel-mirror.storage.googleapis.com/raw.githubusercontent.com/DefinitelyTyped/DefinitelyTyped/ebc69904eb78f94030d5d517b42db20867f679c0/chai/chai.d.ts",
"https://raw.githubusercontent.com/DefinitelyTyped/DefinitelyTyped/ebc69904eb78f94030d5d517b42db20867f679c0/chai/chai.d.ts",
@@ -749,14 +751,13 @@ def tf_workspace(path_prefix = "", tf_repo_name = ""):
"http://bazel-mirror.storage.googleapis.com/raw.githubusercontent.com/DefinitelyTyped/DefinitelyTyped/ebc69904eb78f94030d5d517b42db20867f679c0/mocha/mocha.d.ts",
"https://raw.githubusercontent.com/DefinitelyTyped/DefinitelyTyped/ebc69904eb78f94030d5d517b42db20867f679c0/mocha/mocha.d.ts",
],
- },
- )
+ },)
filegroup_external(
- name = "org_threejs",
+ name="org_threejs",
# no @license header
- licenses = ["notice"], # MIT
- sha256_urls = {
+ licenses=["notice"], # MIT
+ sha256_urls={
"7aff264bd84c90bed3c72a4dc31db8c19151853c6df6980f52b01d3e9872c82d": [
"http://bazel-mirror.storage.googleapis.com/raw.githubusercontent.com/mrdoob/three.js/ad419d40bdaab80abbb34b8f359b4ee840033a02/build/three.js",
"https://raw.githubusercontent.com/mrdoob/three.js/ad419d40bdaab80abbb34b8f359b4ee840033a02/build/three.js",
@@ -765,190 +766,179 @@ def tf_workspace(path_prefix = "", tf_repo_name = ""):
"http://bazel-mirror.storage.googleapis.com/raw.githubusercontent.com/mrdoob/three.js/ad419d40bdaab80abbb34b8f359b4ee840033a02/examples/js/controls/OrbitControls.js",
"https://raw.githubusercontent.com/mrdoob/three.js/ad419d40bdaab80abbb34b8f359b4ee840033a02/examples/js/controls/OrbitControls.js",
],
- },
- )
+ },)
##############################################################################
# TensorBoard JavaScript Testing Dependencies
filegroup_external(
- name = "com_chaijs",
+ name="com_chaijs",
# no @license header
- licenses = ["notice"], # MIT
- sha256_urls = {
+ licenses=["notice"], # MIT
+ sha256_urls={
"b926b325ad9843bf0b7a6d580ef78bb560e47c484b98680098d4fd9b31b77cd9": [
"http://bazel-mirror.storage.googleapis.com/raw.githubusercontent.com/chaijs/chai/2.3.0/chai.js",
"https://raw.githubusercontent.com/chaijs/chai/2.3.0/chai.js",
],
- },
- )
+ },)
filegroup_external(
- name = "org_mochajs",
+ name="org_mochajs",
# no @license header
- licenses = ["notice"], # MIT
- sha256_urls = {
+ licenses=["notice"], # MIT
+ sha256_urls={
"e36d865a17ffdf5868e55e736526ae30f3d4bc667c85a2a28cd5c850a82361e2": [
"http://bazel-mirror.storage.googleapis.com/raw.githubusercontent.com/mochajs/mocha/2.3.4/mocha.js",
"https://raw.githubusercontent.com/mochajs/mocha/2.3.4/mocha.js",
],
- },
- )
+ },)
##############################################################################
# TensorBoard Polymer Dependencies
webfiles_external(
- name = "org_polymer_font_roboto",
- licenses = ["notice"], # BSD-3-Clause
- sha256 = "fae51429b56a4a4c15f1f0c23b733c7095940cc9c04c275fa7adb3bf055b23b3",
- urls = [
+ name="org_polymer_font_roboto",
+ licenses=["notice"], # BSD-3-Clause
+ sha256="fae51429b56a4a4c15f1f0c23b733c7095940cc9c04c275fa7adb3bf055b23b3",
+ urls=[
"http://bazel-mirror.storage.googleapis.com/github.com/PolymerElements/font-roboto/archive/v1.0.1.tar.gz",
"https://github.com/PolymerElements/font-roboto/archive/v1.0.1.tar.gz",
],
- strip_prefix = "font-roboto-1.0.1",
- path = "/font-roboto",
- srcs = ["roboto.html"],
- )
+ strip_prefix="font-roboto-1.0.1",
+ path="/font-roboto",
+ srcs=["roboto.html"],)
webfiles_external(
- name = "org_polymer_iron_a11y_announcer",
- licenses = ["notice"], # BSD-3-Clause
- sha256 = "6bce143db7a374a68535ec8b861a5f30e81f2f1e4ee36a55bda2a891f6fd2818",
- urls = [
+ name="org_polymer_iron_a11y_announcer",
+ licenses=["notice"], # BSD-3-Clause
+ sha256="6bce143db7a374a68535ec8b861a5f30e81f2f1e4ee36a55bda2a891f6fd2818",
+ urls=[
"http://bazel-mirror.storage.googleapis.com/github.com/PolymerElements/iron-a11y-announcer/archive/v1.0.5.tar.gz",
"https://github.com/PolymerElements/iron-a11y-announcer/archive/v1.0.5.tar.gz",
],
- strip_prefix = "iron-a11y-announcer-1.0.5",
- path = "/iron-a11y-announcer",
- srcs = ["iron-a11y-announcer.html"],
- deps = ["@org_polymer"],
- )
+ strip_prefix="iron-a11y-announcer-1.0.5",
+ path="/iron-a11y-announcer",
+ srcs=["iron-a11y-announcer.html"],
+ deps=["@org_polymer"],)
webfiles_external(
- name = "org_polymer_iron_a11y_keys_behavior",
- licenses = ["notice"], # BSD-3-Clause
- sha256 = "6823efc47a83208fd51d39c5a1d3eb0c0bebc705df1ce01310509da22a13ebd2",
- urls = [
+ name="org_polymer_iron_a11y_keys_behavior",
+ licenses=["notice"], # BSD-3-Clause
+ sha256="6823efc47a83208fd51d39c5a1d3eb0c0bebc705df1ce01310509da22a13ebd2",
+ urls=[
"http://bazel-mirror.storage.googleapis.com/github.com/PolymerElements/iron-a11y-keys-behavior/archive/v1.1.8.tar.gz",
"https://github.com/PolymerElements/iron-a11y-keys-behavior/archive/v1.1.8.tar.gz",
],
- strip_prefix = "iron-a11y-keys-behavior-1.1.8",
- path = "/iron-a11y-keys-behavior",
- srcs = ["iron-a11y-keys-behavior.html"],
- deps = ["@org_polymer"],
- )
+ strip_prefix="iron-a11y-keys-behavior-1.1.8",
+ path="/iron-a11y-keys-behavior",
+ srcs=["iron-a11y-keys-behavior.html"],
+ deps=["@org_polymer"],)
webfiles_external(
- name = "org_polymer_iron_ajax",
- licenses = ["notice"], # BSD-3-Clause
- sha256 = "9162d8af4611e911ac3ebbfc08bb7038ac04f6e79a9287b1476fe36ad6770bc5",
- urls = [
+ name="org_polymer_iron_ajax",
+ licenses=["notice"], # BSD-3-Clause
+ sha256="9162d8af4611e911ac3ebbfc08bb7038ac04f6e79a9287b1476fe36ad6770bc5",
+ urls=[
"http://bazel-mirror.storage.googleapis.com/github.com/PolymerElements/iron-ajax/archive/v1.2.0.tar.gz",
"https://github.com/PolymerElements/iron-ajax/archive/v1.2.0.tar.gz",
],
- strip_prefix = "iron-ajax-1.2.0",
- path = "/iron-ajax",
- srcs = [
+ strip_prefix="iron-ajax-1.2.0",
+ path="/iron-ajax",
+ srcs=[
"iron-ajax.html",
"iron-request.html",
],
- deps = [
+ deps=[
"@org_polymer",
"@org_polymer_promise_polyfill",
- ],
- )
+ ],)
webfiles_external(
- name = "org_polymer_iron_autogrow_textarea",
- licenses = ["notice"], # BSD-3-Clause
- sha256 = "50bbb901d2c8f87462e3552e3d671a552faa12c37c485e548d7a234ebffbc427",
- urls = [
+ name="org_polymer_iron_autogrow_textarea",
+ licenses=["notice"], # BSD-3-Clause
+ sha256="50bbb901d2c8f87462e3552e3d671a552faa12c37c485e548d7a234ebffbc427",
+ urls=[
"http://bazel-mirror.storage.googleapis.com/github.com/PolymerElements/iron-autogrow-textarea/archive/v1.0.12.tar.gz",
"https://github.com/PolymerElements/iron-autogrow-textarea/archive/v1.0.12.tar.gz",
],
- strip_prefix = "iron-autogrow-textarea-1.0.12",
- path = "/iron-autogrow-textarea",
- srcs = ["iron-autogrow-textarea.html"],
- deps = [
+ strip_prefix="iron-autogrow-textarea-1.0.12",
+ path="/iron-autogrow-textarea",
+ srcs=["iron-autogrow-textarea.html"],
+ deps=[
"@org_polymer",
"@org_polymer_iron_behaviors",
"@org_polymer_iron_flex_layout",
"@org_polymer_iron_form_element_behavior",
"@org_polymer_iron_validatable_behavior",
- ],
- )
+ ],)
webfiles_external(
- name = "org_polymer_iron_behaviors",
- licenses = ["notice"], # BSD-3-Clause
- sha256 = "a1e8d4b7a13f3d36beba9c2a6b186ed33a53e6af2e79f98c1fcc7e85e7b53f89",
- urls = [
+ name="org_polymer_iron_behaviors",
+ licenses=["notice"], # BSD-3-Clause
+ sha256="a1e8d4b7a13f3d36beba9c2a6b186ed33a53e6af2e79f98c1fcc7e85e7b53f89",
+ urls=[
"http://bazel-mirror.storage.googleapis.com/github.com/PolymerElements/iron-behaviors/archive/v1.0.17.tar.gz",
"https://github.com/PolymerElements/iron-behaviors/archive/v1.0.17.tar.gz",
],
- strip_prefix = "iron-behaviors-1.0.17",
- path = "/iron-behaviors",
- srcs = [
+ strip_prefix="iron-behaviors-1.0.17",
+ path="/iron-behaviors",
+ srcs=[
"iron-button-state.html",
"iron-control-state.html",
],
- deps = [
+ deps=[
"@org_polymer",
"@org_polymer_iron_a11y_keys_behavior",
- ],
- )
+ ],)
webfiles_external(
- name = "org_polymer_iron_checked_element_behavior",
- licenses = ["notice"], # BSD-3-Clause
- sha256 = "539a0e1c4df0bc702d3bd342388e4e56c77ec4c2066cce69e41426a69f92e8bd",
- urls = [
+ name="org_polymer_iron_checked_element_behavior",
+ licenses=["notice"], # BSD-3-Clause
+ sha256="539a0e1c4df0bc702d3bd342388e4e56c77ec4c2066cce69e41426a69f92e8bd",
+ urls=[
"http://bazel-mirror.storage.googleapis.com/github.com/PolymerElements/iron-checked-element-behavior/archive/v1.0.4.tar.gz",
"https://github.com/PolymerElements/iron-checked-element-behavior/archive/v1.0.4.tar.gz",
],
- strip_prefix = "iron-checked-element-behavior-1.0.4",
- path = "/iron-checked-element-behavior",
- srcs = ["iron-checked-element-behavior.html"],
- deps = [
+ strip_prefix="iron-checked-element-behavior-1.0.4",
+ path="/iron-checked-element-behavior",
+ srcs=["iron-checked-element-behavior.html"],
+ deps=[
"@org_polymer",
"@org_polymer_iron_form_element_behavior",
"@org_polymer_iron_validatable_behavior",
- ],
- )
+ ],)
webfiles_external(
- name = "org_polymer_iron_collapse",
- licenses = ["notice"], # BSD-3-Clause
- sha256 = "275808994a609a2f9923e2dd2db1957945ab141ba840eadc33f19e1f406d600e",
- urls = [
+ name="org_polymer_iron_collapse",
+ licenses=["notice"], # BSD-3-Clause
+ sha256="275808994a609a2f9923e2dd2db1957945ab141ba840eadc33f19e1f406d600e",
+ urls=[
"http://bazel-mirror.storage.googleapis.com/github.com/PolymerElements/iron-collapse/archive/v1.0.8.tar.gz",
"https://github.com/PolymerElements/iron-collapse/archive/v1.0.8.tar.gz",
],
- strip_prefix = "iron-collapse-1.0.8",
- path = "/iron-collapse",
- srcs = ["iron-collapse.html"],
- deps = [
+ strip_prefix="iron-collapse-1.0.8",
+ path="/iron-collapse",
+ srcs=["iron-collapse.html"],
+ deps=[
"@org_polymer",
"@org_polymer_iron_resizable_behavior",
- ],
- )
+ ],)
webfiles_external(
- name = "org_polymer_iron_demo_helpers",
- licenses = ["notice"], # BSD-3-Clause
- sha256 = "aa7458492a6ac3d1f6344640a4c2ab07bce64e7ad0422b83b5d665707598cce6",
- urls = [
+ name="org_polymer_iron_demo_helpers",
+ licenses=["notice"], # BSD-3-Clause
+ sha256="aa7458492a6ac3d1f6344640a4c2ab07bce64e7ad0422b83b5d665707598cce6",
+ urls=[
"http://bazel-mirror.storage.googleapis.com/github.com/PolymerElements/iron-demo-helpers/archive/v1.1.0.tar.gz",
"https://github.com/PolymerElements/iron-demo-helpers/archive/v1.1.0.tar.gz",
],
- strip_prefix = "iron-demo-helpers-1.1.0",
- path = "/iron-demo-helpers",
- srcs = [
+ strip_prefix="iron-demo-helpers-1.1.0",
+ path="/iron-demo-helpers",
+ srcs=[
"demo-pages-shared-styles.html",
"demo-snippet.html",
],
- deps = [
+ deps=[
"@org_polymer",
"@org_polymer_iron_flex_layout",
"@org_polymer_iron_icons",
@@ -956,109 +946,103 @@ def tf_workspace(path_prefix = "", tf_repo_name = ""):
"@org_polymer_paper_icon_button",
"@org_polymer_paper_styles",
"@org_polymer_prism_element",
- ],
- )
+ ],)
webfiles_external(
- name = "org_polymer_iron_dropdown",
- licenses = ["notice"], # BSD-3-Clause
- sha256 = "f7e4a31d096d10d8af1920397695cb17f3eb1cbe5e5ff91a861dabfcc085f376",
- urls = [
+ name="org_polymer_iron_dropdown",
+ licenses=["notice"], # BSD-3-Clause
+ sha256="f7e4a31d096d10d8af1920397695cb17f3eb1cbe5e5ff91a861dabfcc085f376",
+ urls=[
"http://bazel-mirror.storage.googleapis.com/github.com/PolymerElements/iron-dropdown/archive/v1.4.0.tar.gz",
"https://github.com/PolymerElements/iron-dropdown/archive/v1.4.0.tar.gz",
],
- strip_prefix = "iron-dropdown-1.4.0",
- path = "/iron-dropdown",
- srcs = [
+ strip_prefix="iron-dropdown-1.4.0",
+ path="/iron-dropdown",
+ srcs=[
"iron-dropdown.html",
"iron-dropdown-scroll-manager.html",
],
- deps = [
+ deps=[
"@org_polymer",
"@org_polymer_iron_a11y_keys_behavior",
"@org_polymer_iron_behaviors",
"@org_polymer_iron_overlay_behavior",
"@org_polymer_iron_resizable_behavior",
"@org_polymer_neon_animation",
- ],
- )
+ ],)
webfiles_external(
- name = "org_polymer_iron_fit_behavior",
- licenses = ["notice"], # BSD-3-Clause
- sha256 = "10132a2ea309a37c4c07b8fead71f64abc588ee6107931e34680f5f36dd8291e",
- urls = [
+ name="org_polymer_iron_fit_behavior",
+ licenses=["notice"], # BSD-3-Clause
+ sha256="10132a2ea309a37c4c07b8fead71f64abc588ee6107931e34680f5f36dd8291e",
+ urls=[
"http://bazel-mirror.storage.googleapis.com/github.com/PolymerElements/iron-fit-behavior/archive/v1.2.5.tar.gz",
"https://github.com/PolymerElements/iron-fit-behavior/archive/v1.2.5.tar.gz",
],
- strip_prefix = "iron-fit-behavior-1.2.5",
- path = "/iron-fit-behavior",
- srcs = ["iron-fit-behavior.html"],
- deps = ["@org_polymer"],
- )
+ strip_prefix="iron-fit-behavior-1.2.5",
+ path="/iron-fit-behavior",
+ srcs=["iron-fit-behavior.html"],
+ deps=["@org_polymer"],)
webfiles_external(
- name = "org_polymer_iron_flex_layout",
- licenses = ["notice"], # BSD-3-Clause
- sha256 = "79287f6ca1c2d4e003f68b88fe19d03a1b6a0011e2b4cae579fe4d1474163a2e",
- urls = [
+ name="org_polymer_iron_flex_layout",
+ licenses=["notice"], # BSD-3-Clause
+ sha256="79287f6ca1c2d4e003f68b88fe19d03a1b6a0011e2b4cae579fe4d1474163a2e",
+ urls=[
"http://bazel-mirror.storage.googleapis.com/github.com/PolymerElements/iron-flex-layout/archive/v1.3.0.tar.gz",
"https://github.com/PolymerElements/iron-flex-layout/archive/v1.3.0.tar.gz",
],
- strip_prefix = "iron-flex-layout-1.3.0",
- path = "/iron-flex-layout",
- srcs = [
+ strip_prefix="iron-flex-layout-1.3.0",
+ path="/iron-flex-layout",
+ srcs=[
"classes/iron-flex-layout.html",
"classes/iron-shadow-flex-layout.html",
"iron-flex-layout.html",
"iron-flex-layout-classes.html",
],
- deps = ["@org_polymer"],
- )
+ deps=["@org_polymer"],)
webfiles_external(
- name = "org_polymer_iron_form_element_behavior",
- licenses = ["notice"], # BSD-3-Clause
- sha256 = "1dd9371c638e5bc2ecba8a64074aa680dfb8712198e9612f9ed24d387efc8f26",
- urls = [
+ name="org_polymer_iron_form_element_behavior",
+ licenses=["notice"], # BSD-3-Clause
+ sha256="1dd9371c638e5bc2ecba8a64074aa680dfb8712198e9612f9ed24d387efc8f26",
+ urls=[
"http://bazel-mirror.storage.googleapis.com/github.com/PolymerElements/iron-form-element-behavior/archive/v1.0.6.tar.gz",
"https://github.com/PolymerElements/iron-form-element-behavior/archive/v1.0.6.tar.gz",
],
- strip_prefix = "iron-form-element-behavior-1.0.6",
- path = "/iron-form-element-behavior",
- srcs = ["iron-form-element-behavior.html"],
- deps = ["@org_polymer"],
- )
+ strip_prefix="iron-form-element-behavior-1.0.6",
+ path="/iron-form-element-behavior",
+ srcs=["iron-form-element-behavior.html"],
+ deps=["@org_polymer"],)
webfiles_external(
- name = "org_polymer_iron_icon",
- licenses = ["notice"], # BSD-3-Clause
- sha256 = "9ed58a69159a02c07a6050d242e6d4e585a29f3245b8c8c390cfd52ddb786dc4",
- urls = [
+ name="org_polymer_iron_icon",
+ licenses=["notice"], # BSD-3-Clause
+ sha256="9ed58a69159a02c07a6050d242e6d4e585a29f3245b8c8c390cfd52ddb786dc4",
+ urls=[
"http://bazel-mirror.storage.googleapis.com/github.com/PolymerElements/iron-icon/archive/v1.0.11.tar.gz",
"https://github.com/PolymerElements/iron-icon/archive/v1.0.11.tar.gz",
],
- strip_prefix = "iron-icon-1.0.11",
- path = "/iron-icon",
- srcs = ["iron-icon.html"],
- deps = [
+ strip_prefix="iron-icon-1.0.11",
+ path="/iron-icon",
+ srcs=["iron-icon.html"],
+ deps=[
"@org_polymer",
"@org_polymer_iron_flex_layout",
"@org_polymer_iron_meta",
- ],
- )
+ ],)
webfiles_external(
- name = "org_polymer_iron_icons",
- licenses = ["notice"], # BSD-3-Clause
- sha256 = "3b18542c147c7923dc3a36b1a51984a73255d610f297d43c9aaccc52859bd0d0",
- urls = [
+ name="org_polymer_iron_icons",
+ licenses=["notice"], # BSD-3-Clause
+ sha256="3b18542c147c7923dc3a36b1a51984a73255d610f297d43c9aaccc52859bd0d0",
+ urls=[
"http://bazel-mirror.storage.googleapis.com/github.com/PolymerElements/iron-icons/archive/v1.1.3.tar.gz",
"https://github.com/PolymerElements/iron-icons/archive/v1.1.3.tar.gz",
],
- strip_prefix = "iron-icons-1.1.3",
- path = "/iron-icons",
- srcs = [
+ strip_prefix="iron-icons-1.1.3",
+ path="/iron-icons",
+ srcs=[
"av-icons.html",
"communication-icons.html",
"device-icons.html",
@@ -1071,247 +1055,233 @@ def tf_workspace(path_prefix = "", tf_repo_name = ""):
"places-icons.html",
"social-icons.html",
],
- deps = [
+ deps=[
"@org_polymer_iron_icon",
"@org_polymer_iron_iconset_svg",
- ],
- )
+ ],)
webfiles_external(
- name = "org_polymer_iron_iconset_svg",
- licenses = ["notice"], # BSD-3-Clause
- sha256 = "7e3925b7e63a7d22524c4b43ce16ab80d06a576649644783643c11a003284368",
- urls = [
+ name="org_polymer_iron_iconset_svg",
+ licenses=["notice"], # BSD-3-Clause
+ sha256="7e3925b7e63a7d22524c4b43ce16ab80d06a576649644783643c11a003284368",
+ urls=[
"http://bazel-mirror.storage.googleapis.com/github.com/PolymerElements/iron-iconset-svg/archive/v1.1.0.tar.gz",
"https://github.com/PolymerElements/iron-iconset-svg/archive/v1.1.0.tar.gz",
],
- strip_prefix = "iron-iconset-svg-1.1.0",
- path = "/iron-iconset-svg",
- srcs = ["iron-iconset-svg.html"],
- deps = [
+ strip_prefix="iron-iconset-svg-1.1.0",
+ path="/iron-iconset-svg",
+ srcs=["iron-iconset-svg.html"],
+ deps=[
"@org_polymer",
"@org_polymer_iron_meta",
- ],
- )
+ ],)
webfiles_external(
- name = "org_polymer_iron_input",
- licenses = ["notice"], # BSD-3-Clause
- sha256 = "c505101ead08ab25526b1f49baecc8c28b4221b92a65e7334c783bdc81553c36",
- urls = [
+ name="org_polymer_iron_input",
+ licenses=["notice"], # BSD-3-Clause
+ sha256="c505101ead08ab25526b1f49baecc8c28b4221b92a65e7334c783bdc81553c36",
+ urls=[
"http://bazel-mirror.storage.googleapis.com/github.com/PolymerElements/iron-input/archive/1.0.10.tar.gz",
"https://github.com/PolymerElements/iron-input/archive/1.0.10.tar.gz",
],
- strip_prefix = "iron-input-1.0.10",
- path = "/iron-input",
- srcs = ["iron-input.html"],
- deps = [
+ strip_prefix="iron-input-1.0.10",
+ path="/iron-input",
+ srcs=["iron-input.html"],
+ deps=[
"@org_polymer",
"@org_polymer_iron_a11y_announcer",
"@org_polymer_iron_validatable_behavior",
- ],
- )
+ ],)
webfiles_external(
- name = "org_polymer_iron_list",
- licenses = ["notice"], # BSD-3-Clause
- sha256 = "72a6530b9f0ad5557f5d287845792a0ada74d8b159198e27f940e226313dc116",
- urls = [
+ name="org_polymer_iron_list",
+ licenses=["notice"], # BSD-3-Clause
+ sha256="72a6530b9f0ad5557f5d287845792a0ada74d8b159198e27f940e226313dc116",
+ urls=[
"http://bazel-mirror.storage.googleapis.com/github.com/PolymerElements/iron-list/archive/v1.3.9.tar.gz",
"https://github.com/PolymerElements/iron-list/archive/v1.3.9.tar.gz",
],
- strip_prefix = "iron-list-1.3.9",
- path = "/iron-list",
- srcs = ["iron-list.html"],
- deps = [
+ strip_prefix="iron-list-1.3.9",
+ path="/iron-list",
+ srcs=["iron-list.html"],
+ deps=[
"@org_polymer",
"@org_polymer_iron_a11y_keys_behavior",
"@org_polymer_iron_resizable_behavior",
"@org_polymer_iron_scroll_target_behavior",
- ],
- )
+ ],)
webfiles_external(
- name = "org_polymer_iron_menu_behavior",
- licenses = ["notice"], # BSD-3-Clause
- sha256 = "ad27889343bc9a709258b073f69abc028bb1ffd3fdb975cd2d3939f7f5d7bb6c",
- urls = [
+ name="org_polymer_iron_menu_behavior",
+ licenses=["notice"], # BSD-3-Clause
+ sha256="ad27889343bc9a709258b073f69abc028bb1ffd3fdb975cd2d3939f7f5d7bb6c",
+ urls=[
"http://bazel-mirror.storage.googleapis.com/github.com/PolymerElements/iron-menu-behavior/archive/v1.1.10.tar.gz",
"https://github.com/PolymerElements/iron-menu-behavior/archive/v1.1.10.tar.gz",
],
- strip_prefix = "iron-menu-behavior-1.1.10",
- path = "/iron-menu-behavior",
- srcs = [
+ strip_prefix="iron-menu-behavior-1.1.10",
+ path="/iron-menu-behavior",
+ srcs=[
"iron-menu-behavior.html",
"iron-menubar-behavior.html",
],
- deps = [
+ deps=[
"@org_polymer",
"@org_polymer_iron_a11y_keys_behavior",
"@org_polymer_iron_selector",
- ],
- )
+ ],)
webfiles_external(
- name = "org_polymer_iron_meta",
- licenses = ["notice"], # BSD-3-Clause
- sha256 = "fb05e6031bae6b4effe5f15d44b3f548d5807f9e3b3aa2442ba17cf4b8b84361",
- urls = [
+ name="org_polymer_iron_meta",
+ licenses=["notice"], # BSD-3-Clause
+ sha256="fb05e6031bae6b4effe5f15d44b3f548d5807f9e3b3aa2442ba17cf4b8b84361",
+ urls=[
"http://bazel-mirror.storage.googleapis.com/github.com/PolymerElements/iron-meta/archive/v1.1.1.tar.gz",
"https://github.com/PolymerElements/iron-meta/archive/v1.1.1.tar.gz",
],
- strip_prefix = "iron-meta-1.1.1",
- path = "/iron-meta",
- srcs = ["iron-meta.html"],
- deps = ["@org_polymer"],
- )
+ strip_prefix="iron-meta-1.1.1",
+ path="/iron-meta",
+ srcs=["iron-meta.html"],
+ deps=["@org_polymer"],)
webfiles_external(
- name = "org_polymer_iron_overlay_behavior",
- licenses = ["notice"], # BSD-3-Clause
- sha256 = "3df5b54ff2e0510c87a2aff8c9d730d3fe83d3d11277cc1a49fa29b549acb46c",
- urls = [
+ name="org_polymer_iron_overlay_behavior",
+ licenses=["notice"], # BSD-3-Clause
+ sha256="3df5b54ff2e0510c87a2aff8c9d730d3fe83d3d11277cc1a49fa29b549acb46c",
+ urls=[
"http://bazel-mirror.storage.googleapis.com/github.com/PolymerElements/iron-overlay-behavior/archive/v1.10.1.tar.gz",
"https://github.com/PolymerElements/iron-overlay-behavior/archive/v1.10.1.tar.gz",
],
- strip_prefix = "iron-overlay-behavior-1.10.1",
- path = "/iron-overlay-behavior",
- srcs = [
+ strip_prefix="iron-overlay-behavior-1.10.1",
+ path="/iron-overlay-behavior",
+ srcs=[
"iron-focusables-helper.html",
"iron-overlay-backdrop.html",
"iron-overlay-behavior.html",
"iron-overlay-manager.html",
],
- deps = [
+ deps=[
"@org_polymer",
"@org_polymer_iron_a11y_keys_behavior",
"@org_polymer_iron_fit_behavior",
"@org_polymer_iron_resizable_behavior",
- ],
- )
+ ],)
webfiles_external(
- name = "org_polymer_iron_range_behavior",
- licenses = ["notice"], # BSD-3-Clause
- sha256 = "b2f2b6d52284542330bd30b586e217926eb0adec5e13934a3cef557717c22dc2",
- urls = [
+ name="org_polymer_iron_range_behavior",
+ licenses=["notice"], # BSD-3-Clause
+ sha256="b2f2b6d52284542330bd30b586e217926eb0adec5e13934a3cef557717c22dc2",
+ urls=[
"http://bazel-mirror.storage.googleapis.com/github.com/PolymerElements/iron-range-behavior/archive/v1.0.4.tar.gz",
"https://github.com/PolymerElements/iron-range-behavior/archive/v1.0.4.tar.gz",
],
- strip_prefix = "iron-range-behavior-1.0.4",
- path = "/iron-range-behavior",
- srcs = ["iron-range-behavior.html"],
- deps = ["@org_polymer"],
- )
+ strip_prefix="iron-range-behavior-1.0.4",
+ path="/iron-range-behavior",
+ srcs=["iron-range-behavior.html"],
+ deps=["@org_polymer"],)
webfiles_external(
- name = "org_polymer_iron_resizable_behavior",
- licenses = ["notice"], # BSD-3-Clause
- sha256 = "a87a78ee9223c2f6afae7fc94a3ff91cbce6f7e2a7ed3f2979af7945c9281616",
- urls = [
+ name="org_polymer_iron_resizable_behavior",
+ licenses=["notice"], # BSD-3-Clause
+ sha256="a87a78ee9223c2f6afae7fc94a3ff91cbce6f7e2a7ed3f2979af7945c9281616",
+ urls=[
"http://bazel-mirror.storage.googleapis.com/github.com/PolymerElements/iron-resizable-behavior/archive/v1.0.3.tar.gz",
"https://github.com/PolymerElements/iron-resizable-behavior/archive/v1.0.3.tar.gz",
],
- strip_prefix = "iron-resizable-behavior-1.0.3",
- path = "/iron-resizable-behavior",
- srcs = ["iron-resizable-behavior.html"],
- deps = ["@org_polymer"],
- )
+ strip_prefix="iron-resizable-behavior-1.0.3",
+ path="/iron-resizable-behavior",
+ srcs=["iron-resizable-behavior.html"],
+ deps=["@org_polymer"],)
webfiles_external(
- name = "org_polymer_iron_scroll_target_behavior",
- licenses = ["notice"], # BSD-3-Clause
- sha256 = "d0de0c804b1ec91d814754144afd9da1cdb082690de88bd5e47fd5f41990746f",
- urls = [
+ name="org_polymer_iron_scroll_target_behavior",
+ licenses=["notice"], # BSD-3-Clause
+ sha256="d0de0c804b1ec91d814754144afd9da1cdb082690de88bd5e47fd5f41990746f",
+ urls=[
"http://bazel-mirror.storage.googleapis.com/github.com/PolymerElements/iron-scroll-target-behavior/archive/v1.0.3.tar.gz",
"https://github.com/PolymerElements/iron-scroll-target-behavior/archive/v1.0.3.tar.gz",
],
- strip_prefix = "iron-scroll-target-behavior-1.0.3",
- path = "/iron-scroll-target-behavior",
- srcs = ["iron-scroll-target-behavior.html"],
- deps = ["@org_polymer"],
- )
+ strip_prefix="iron-scroll-target-behavior-1.0.3",
+ path="/iron-scroll-target-behavior",
+ srcs=["iron-scroll-target-behavior.html"],
+ deps=["@org_polymer"],)
webfiles_external(
- name = "org_polymer_iron_selector",
- licenses = ["notice"], # BSD-3-Clause
- sha256 = "ba28a47443bad3b744611c9d7a79fb21dbdf2e35edc5ef8f812e2dcd72b16747",
- urls = [
+ name="org_polymer_iron_selector",
+ licenses=["notice"], # BSD-3-Clause
+ sha256="ba28a47443bad3b744611c9d7a79fb21dbdf2e35edc5ef8f812e2dcd72b16747",
+ urls=[
"http://bazel-mirror.storage.googleapis.com/github.com/PolymerElements/iron-selector/archive/v1.5.2.tar.gz",
"https://github.com/PolymerElements/iron-selector/archive/v1.5.2.tar.gz",
],
- strip_prefix = "iron-selector-1.5.2",
- path = "/iron-selector",
- srcs = [
+ strip_prefix="iron-selector-1.5.2",
+ path="/iron-selector",
+ srcs=[
"iron-multi-selectable.html",
"iron-selectable.html",
"iron-selection.html",
"iron-selector.html",
],
- deps = ["@org_polymer"],
- )
+ deps=["@org_polymer"],)
webfiles_external(
- name = "org_polymer_iron_validatable_behavior",
- licenses = ["notice"], # BSD-3-Clause
- sha256 = "aef4901e68043824f36104799269573dd345ffaac494186e466fdc79c06fdb63",
- urls = [
+ name="org_polymer_iron_validatable_behavior",
+ licenses=["notice"], # BSD-3-Clause
+ sha256="aef4901e68043824f36104799269573dd345ffaac494186e466fdc79c06fdb63",
+ urls=[
"http://bazel-mirror.storage.googleapis.com/github.com/PolymerElements/iron-validatable-behavior/archive/v1.1.1.tar.gz",
"https://github.com/PolymerElements/iron-validatable-behavior/archive/v1.1.1.tar.gz",
],
- strip_prefix = "iron-validatable-behavior-1.1.1",
- path = "/iron-validatable-behavior",
- srcs = ["iron-validatable-behavior.html"],
- deps = [
+ strip_prefix="iron-validatable-behavior-1.1.1",
+ path="/iron-validatable-behavior",
+ srcs=["iron-validatable-behavior.html"],
+ deps=[
"@org_polymer",
"@org_polymer_iron_meta",
- ],
- )
+ ],)
webfiles_external(
- name = "org_polymer_marked",
- licenses = ["notice"], # MIT
- sha256 = "93d30bd593736ca440938d77808b7ef5972da0f3fcfe4ae63ae7b4ce117da2cb",
- urls = [
+ name="org_polymer_marked",
+ licenses=["notice"], # MIT
+ sha256="93d30bd593736ca440938d77808b7ef5972da0f3fcfe4ae63ae7b4ce117da2cb",
+ urls=[
"http://bazel-mirror.storage.googleapis.com/github.com/chjj/marked/archive/v0.3.2.zip",
"https://github.com/chjj/marked/archive/v0.3.2.zip",
],
- strip_prefix = "marked-0.3.2",
- path = "/marked",
- srcs = ["lib/marked.js"],
- )
+ strip_prefix="marked-0.3.2",
+ path="/marked",
+ srcs=["lib/marked.js"],)
webfiles_external(
- name = "org_polymer_marked_element",
- licenses = ["notice"], # BSD-3-Clause
- sha256 = "7547616df95f8b903757e6afbabfcdba5322c2bcec3f17c726b8bba5adf4bc5f",
- urls = [
+ name="org_polymer_marked_element",
+ licenses=["notice"], # BSD-3-Clause
+ sha256="7547616df95f8b903757e6afbabfcdba5322c2bcec3f17c726b8bba5adf4bc5f",
+ urls=[
"http://bazel-mirror.storage.googleapis.com/github.com/PolymerElements/marked-element/archive/v1.1.3.tar.gz",
"https://github.com/PolymerElements/marked-element/archive/v1.1.3.tar.gz",
],
- strip_prefix = "marked-element-1.1.3",
- path = "/marked-element",
- srcs = [
+ strip_prefix="marked-element-1.1.3",
+ path="/marked-element",
+ srcs=[
"marked-element.html",
"marked-import.html",
],
- deps = [
+ deps=[
"@org_polymer",
"@org_polymer_marked",
- ],
- )
+ ],)
webfiles_external(
- name = "org_polymer_neon_animation",
- licenses = ["notice"], # BSD-3-Clause
- sha256 = "8800c314a76b2da190a2b203259c1091f6d38e0057ed37c2a3d0b734980fa9a5",
- urls = [
+ name="org_polymer_neon_animation",
+ licenses=["notice"], # BSD-3-Clause
+ sha256="8800c314a76b2da190a2b203259c1091f6d38e0057ed37c2a3d0b734980fa9a5",
+ urls=[
"http://bazel-mirror.storage.googleapis.com/github.com/PolymerElements/neon-animation/archive/v1.2.2.tar.gz",
"https://github.com/PolymerElements/neon-animation/archive/v1.2.2.tar.gz",
],
- strip_prefix = "neon-animation-1.2.2",
- path = "/neon-animation",
- srcs = [
+ strip_prefix="neon-animation-1.2.2",
+ path="/neon-animation",
+ srcs=[
"animations/cascaded-animation.html",
"animations/fade-in-animation.html",
"animations/fade-out-animation.html",
@@ -1341,155 +1311,148 @@ def tf_workspace(path_prefix = "", tf_repo_name = ""):
"neon-shared-element-animation-behavior.html",
"web-animations.html",
],
- deps = [
+ deps=[
"@org_polymer",
"@org_polymer_iron_meta",
"@org_polymer_iron_resizable_behavior",
"@org_polymer_iron_selector",
"@org_polymer_web_animations_js",
- ],
- )
+ ],)
webfiles_external(
- name = "org_polymer_paper_behaviors",
- licenses = ["notice"], # BSD-3-Clause
- sha256 = "7cfcb9082ef9909da262df6b5c120bc62dbeaff278cb563e8fc60465ddd387e5",
- urls = [
+ name="org_polymer_paper_behaviors",
+ licenses=["notice"], # BSD-3-Clause
+ sha256="7cfcb9082ef9909da262df6b5c120bc62dbeaff278cb563e8fc60465ddd387e5",
+ urls=[
"http://bazel-mirror.storage.googleapis.com/github.com/PolymerElements/paper-behaviors/archive/v1.0.12.tar.gz",
"https://github.com/PolymerElements/paper-behaviors/archive/v1.0.12.tar.gz",
],
- strip_prefix = "paper-behaviors-1.0.12",
- path = "/paper-behaviors",
- srcs = [
+ strip_prefix="paper-behaviors-1.0.12",
+ path="/paper-behaviors",
+ srcs=[
"paper-button-behavior.html",
"paper-checked-element-behavior.html",
"paper-inky-focus-behavior.html",
"paper-ripple-behavior.html",
],
- deps = [
+ deps=[
"@org_polymer",
"@org_polymer_iron_behaviors",
"@org_polymer_iron_checked_element_behavior",
"@org_polymer_paper_ripple",
- ],
- )
+ ],)
webfiles_external(
- name = "org_polymer_paper_button",
- licenses = ["notice"], # BSD-3-Clause
- sha256 = "896c0a7e34bfcce63fc23c63e105ed9c4d62fa3a6385b7161e1e5cd4058820a6",
- urls = [
+ name="org_polymer_paper_button",
+ licenses=["notice"], # BSD-3-Clause
+ sha256="896c0a7e34bfcce63fc23c63e105ed9c4d62fa3a6385b7161e1e5cd4058820a6",
+ urls=[
"http://bazel-mirror.storage.googleapis.com/github.com/PolymerElements/paper-button/archive/v1.0.11.tar.gz",
"https://github.com/PolymerElements/paper-button/archive/v1.0.11.tar.gz",
],
- strip_prefix = "paper-button-1.0.11",
- path = "/paper-button",
- srcs = ["paper-button.html"],
- deps = [
+ strip_prefix="paper-button-1.0.11",
+ path="/paper-button",
+ srcs=["paper-button.html"],
+ deps=[
"@org_polymer",
"@org_polymer_iron_flex_layout",
"@org_polymer_paper_behaviors",
"@org_polymer_paper_material",
"@org_polymer_paper_ripple",
- ],
- )
+ ],)
webfiles_external(
- name = "org_polymer_paper_checkbox",
- licenses = ["notice"], # BSD-3-Clause
- sha256 = "6828a6954a048b1230fbd2606faffbae950ba1d042175b96ec50ae355786a166",
- urls = [
+ name="org_polymer_paper_checkbox",
+ licenses=["notice"], # BSD-3-Clause
+ sha256="6828a6954a048b1230fbd2606faffbae950ba1d042175b96ec50ae355786a166",
+ urls=[
"http://bazel-mirror.storage.googleapis.com/github.com/PolymerElements/paper-checkbox/archive/v1.4.0.tar.gz",
"https://github.com/PolymerElements/paper-checkbox/archive/v1.4.0.tar.gz",
],
- strip_prefix = "paper-checkbox-1.4.0",
- path = "/paper-checkbox",
- srcs = ["paper-checkbox.html"],
- deps = [
+ strip_prefix="paper-checkbox-1.4.0",
+ path="/paper-checkbox",
+ srcs=["paper-checkbox.html"],
+ deps=[
"@org_polymer",
"@org_polymer_paper_behaviors",
"@org_polymer_paper_styles",
- ],
- )
+ ],)
webfiles_external(
- name = "org_polymer_paper_dialog",
- licenses = ["notice"], # BSD-3-Clause
- sha256 = "c6a9709e7f528d03dcd574503c18b72d4751ca30017346d16e6a791d37ed9259",
- urls = [
+ name="org_polymer_paper_dialog",
+ licenses=["notice"], # BSD-3-Clause
+ sha256="c6a9709e7f528d03dcd574503c18b72d4751ca30017346d16e6a791d37ed9259",
+ urls=[
"http://bazel-mirror.storage.googleapis.com/github.com/PolymerElements/paper-dialog/archive/v1.0.4.tar.gz",
"https://github.com/PolymerElements/paper-dialog/archive/v1.0.4.tar.gz",
],
- strip_prefix = "paper-dialog-1.0.4",
- path = "/paper-dialog",
- srcs = ["paper-dialog.html"],
- deps = [
+ strip_prefix="paper-dialog-1.0.4",
+ path="/paper-dialog",
+ srcs=["paper-dialog.html"],
+ deps=[
"@org_polymer",
"@org_polymer_neon_animation",
"@org_polymer_paper_dialog_behavior",
- ],
- )
+ ],)
webfiles_external(
- name = "org_polymer_paper_dialog_behavior",
- licenses = ["notice"], # BSD-3-Clause
- sha256 = "a7e0e27ce63554bc14f384cf94bcfa24da8dc5f5120dfd565f45e166261aee40",
- urls = [
+ name="org_polymer_paper_dialog_behavior",
+ licenses=["notice"], # BSD-3-Clause
+ sha256="a7e0e27ce63554bc14f384cf94bcfa24da8dc5f5120dfd565f45e166261aee40",
+ urls=[
"http://bazel-mirror.storage.googleapis.com/github.com/PolymerElements/paper-dialog-behavior/archive/v1.2.5.tar.gz",
"https://github.com/PolymerElements/paper-dialog-behavior/archive/v1.2.5.tar.gz",
],
- strip_prefix = "paper-dialog-behavior-1.2.5",
- path = "/paper-dialog-behavior",
- srcs = [
+ strip_prefix="paper-dialog-behavior-1.2.5",
+ path="/paper-dialog-behavior",
+ srcs=[
"paper-dialog-behavior.html",
"paper-dialog-common.css",
"paper-dialog-shared-styles.html",
],
- suppress = ["cssSyntax"],
- deps = [
+ suppress=["cssSyntax"],
+ deps=[
"@org_polymer",
"@org_polymer_iron_flex_layout",
"@org_polymer_iron_overlay_behavior",
"@org_polymer_paper_styles",
- ],
- )
+ ],)
webfiles_external(
- name = "org_polymer_paper_dialog_scrollable",
- licenses = ["notice"], # BSD-3-Clause
- sha256 = "a2e69283e7674f782c44d811387a0f8da2d01fac0172743d1add65e253e6b5ff",
- urls = [
+ name="org_polymer_paper_dialog_scrollable",
+ licenses=["notice"], # BSD-3-Clause
+ sha256="a2e69283e7674f782c44d811387a0f8da2d01fac0172743d1add65e253e6b5ff",
+ urls=[
"http://bazel-mirror.storage.googleapis.com/github.com/PolymerElements/paper-dialog-scrollable/archive/1.1.5.tar.gz",
"https://github.com/PolymerElements/paper-dialog-scrollable/archive/1.1.5.tar.gz",
],
- strip_prefix = "paper-dialog-scrollable-1.1.5",
- path = "/paper-dialog-scrollable",
- srcs = ["paper-dialog-scrollable.html"],
- deps = [
+ strip_prefix="paper-dialog-scrollable-1.1.5",
+ path="/paper-dialog-scrollable",
+ srcs=["paper-dialog-scrollable.html"],
+ deps=[
"@org_polymer",
"@org_polymer_iron_flex_layout",
"@org_polymer_paper_dialog_behavior",
"@org_polymer_paper_styles",
- ],
- )
+ ],)
webfiles_external(
- name = "org_polymer_paper_dropdown_menu",
- licenses = ["notice"], # BSD-3-Clause
- sha256 = "9d88f654ec03ee9be211df9e69bede9e8a22b51bf1dbcc63b79762e4256d81ad",
- urls = [
+ name="org_polymer_paper_dropdown_menu",
+ licenses=["notice"], # BSD-3-Clause
+ sha256="9d88f654ec03ee9be211df9e69bede9e8a22b51bf1dbcc63b79762e4256d81ad",
+ urls=[
"http://bazel-mirror.storage.googleapis.com/github.com/PolymerElements/paper-dropdown-menu/archive/v1.4.0.tar.gz",
"https://github.com/PolymerElements/paper-dropdown-menu/archive/v1.4.0.tar.gz",
],
- strip_prefix = "paper-dropdown-menu-1.4.0",
- path = "/paper-dropdown-menu",
- srcs = [
+ strip_prefix="paper-dropdown-menu-1.4.0",
+ path="/paper-dropdown-menu",
+ srcs=[
"paper-dropdown-menu.html",
"paper-dropdown-menu-icons.html",
"paper-dropdown-menu-light.html",
"paper-dropdown-menu-shared-styles.html",
],
- deps = [
+ deps=[
"@org_polymer",
"@org_polymer_iron_a11y_keys_behavior",
"@org_polymer_iron_behaviors",
@@ -1502,59 +1465,56 @@ def tf_workspace(path_prefix = "", tf_repo_name = ""):
"@org_polymer_paper_menu_button",
"@org_polymer_paper_ripple",
"@org_polymer_paper_styles",
- ],
- )
+ ],)
webfiles_external(
- name = "org_polymer_paper_header_panel",
- licenses = ["notice"], # BSD-3-Clause
- sha256 = "0db4bd8a4bf6f20dcd0dffb4f907b31c93a8647c9c021344239cf30b40b87075",
- urls = [
+ name="org_polymer_paper_header_panel",
+ licenses=["notice"], # BSD-3-Clause
+ sha256="0db4bd8a4bf6f20dcd0dffb4f907b31c93a8647c9c021344239cf30b40b87075",
+ urls=[
"http://bazel-mirror.storage.googleapis.com/github.com/PolymerElements/paper-header-panel/archive/v1.1.4.tar.gz",
"https://github.com/PolymerElements/paper-header-panel/archive/v1.1.4.tar.gz",
],
- strip_prefix = "paper-header-panel-1.1.4",
- path = "/paper-header-panel",
- srcs = ["paper-header-panel.html"],
- deps = [
+ strip_prefix="paper-header-panel-1.1.4",
+ path="/paper-header-panel",
+ srcs=["paper-header-panel.html"],
+ deps=[
"@org_polymer",
"@org_polymer_iron_flex_layout",
- ],
- )
+ ],)
webfiles_external(
- name = "org_polymer_paper_icon_button",
- licenses = ["notice"], # BSD-3-Clause
- sha256 = "9cba5bcfd6aeb4c41581c1392c678cf2278d360e9d122f4d9db54a9ebb404496",
- urls = [
+ name="org_polymer_paper_icon_button",
+ licenses=["notice"], # BSD-3-Clause
+ sha256="9cba5bcfd6aeb4c41581c1392c678cf2278d360e9d122f4d9db54a9ebb404496",
+ urls=[
"http://bazel-mirror.storage.googleapis.com/github.com/PolymerElements/paper-icon-button/archive/v1.1.3.tar.gz",
"https://github.com/PolymerElements/paper-icon-button/archive/v1.1.3.tar.gz",
],
- strip_prefix = "paper-icon-button-1.1.3",
- path = "/paper-icon-button",
- srcs = [
+ strip_prefix="paper-icon-button-1.1.3",
+ path="/paper-icon-button",
+ srcs=[
"paper-icon-button.html",
"paper-icon-button-light.html",
],
- deps = [
+ deps=[
"@org_polymer",
"@org_polymer_iron_icon",
"@org_polymer_paper_behaviors",
"@org_polymer_paper_styles",
- ],
- )
+ ],)
webfiles_external(
- name = "org_polymer_paper_input",
- licenses = ["notice"], # BSD-3-Clause
- sha256 = "17c3dea9bb1c2026cc61324696c6c774214a0dc37686b91ca214a6af550994db",
- urls = [
+ name="org_polymer_paper_input",
+ licenses=["notice"], # BSD-3-Clause
+ sha256="17c3dea9bb1c2026cc61324696c6c774214a0dc37686b91ca214a6af550994db",
+ urls=[
"http://bazel-mirror.storage.googleapis.com/github.com/PolymerElements/paper-input/archive/v1.1.18.tar.gz",
"https://github.com/PolymerElements/paper-input/archive/v1.1.18.tar.gz",
],
- strip_prefix = "paper-input-1.1.18",
- path = "/paper-input",
- srcs = [
+ strip_prefix="paper-input-1.1.18",
+ path="/paper-input",
+ srcs=[
"paper-input.html",
"paper-input-addon-behavior.html",
"paper-input-behavior.html",
@@ -1563,7 +1523,7 @@ def tf_workspace(path_prefix = "", tf_repo_name = ""):
"paper-input-error.html",
"paper-textarea.html",
],
- deps = [
+ deps=[
"@org_polymer",
"@org_polymer_iron_a11y_keys_behavior",
"@org_polymer_iron_autogrow_textarea",
@@ -1572,206 +1532,196 @@ def tf_workspace(path_prefix = "", tf_repo_name = ""):
"@org_polymer_iron_form_element_behavior",
"@org_polymer_iron_input",
"@org_polymer_paper_styles",
- ],
- )
+ ],)
webfiles_external(
- name = "org_polymer_paper_item",
- licenses = ["notice"], # BSD-3-Clause
- sha256 = "12ee0dcb61b0d5721c5988571f6974d7b2211e97724f4195893fbcc9058cdac8",
- urls = [
+ name="org_polymer_paper_item",
+ licenses=["notice"], # BSD-3-Clause
+ sha256="12ee0dcb61b0d5721c5988571f6974d7b2211e97724f4195893fbcc9058cdac8",
+ urls=[
"http://bazel-mirror.storage.googleapis.com/github.com/PolymerElements/paper-item/archive/v1.1.4.tar.gz",
"https://github.com/PolymerElements/paper-item/archive/v1.1.4.tar.gz",
],
- strip_prefix = "paper-item-1.1.4",
- path = "/paper-item",
- srcs = [
+ strip_prefix="paper-item-1.1.4",
+ path="/paper-item",
+ srcs=[
"paper-icon-item.html",
"paper-item.html",
"paper-item-behavior.html",
"paper-item-body.html",
"paper-item-shared-styles.html",
],
- deps = [
+ deps=[
"@org_polymer",
"@org_polymer_iron_behaviors",
"@org_polymer_iron_flex_layout",
"@org_polymer_paper_styles",
- ],
- )
+ ],)
webfiles_external(
- name = "org_polymer_paper_listbox",
- licenses = ["notice"], # BSD-3-Clause
- sha256 = "3cb35f4fe9a3f15185a9e91711dba8f27e9291c8cd371ebf1be21b8f1d5f65fb",
- urls = [
+ name="org_polymer_paper_listbox",
+ licenses=["notice"], # BSD-3-Clause
+ sha256="3cb35f4fe9a3f15185a9e91711dba8f27e9291c8cd371ebf1be21b8f1d5f65fb",
+ urls=[
"http://bazel-mirror.storage.googleapis.com/github.com/PolymerElements/paper-listbox/archive/v1.1.2.tar.gz",
"https://github.com/PolymerElements/paper-listbox/archive/v1.1.2.tar.gz",
],
- strip_prefix = "paper-listbox-1.1.2",
- path = "/paper-listbox",
- srcs = ["paper-listbox.html"],
- deps = [
+ strip_prefix="paper-listbox-1.1.2",
+ path="/paper-listbox",
+ srcs=["paper-listbox.html"],
+ deps=[
"@org_polymer",
"@org_polymer_iron_menu_behavior",
"@org_polymer_paper_styles",
- ],
- )
+ ],)
webfiles_external(
- name = "org_polymer_paper_material",
- licenses = ["notice"], # BSD-3-Clause
- sha256 = "09f6c8bd6ddbea2be541dc86306efe41cdfb31bec0b69d35a5dc29772bbc8506",
- urls = [
+ name="org_polymer_paper_material",
+ licenses=["notice"], # BSD-3-Clause
+ sha256="09f6c8bd6ddbea2be541dc86306efe41cdfb31bec0b69d35a5dc29772bbc8506",
+ urls=[
"http://bazel-mirror.storage.googleapis.com/github.com/PolymerElements/paper-material/archive/v1.0.6.tar.gz",
"https://github.com/PolymerElements/paper-material/archive/v1.0.6.tar.gz",
],
- strip_prefix = "paper-material-1.0.6",
- path = "/paper-material",
- srcs = [
+ strip_prefix="paper-material-1.0.6",
+ path="/paper-material",
+ srcs=[
"paper-material.html",
"paper-material-shared-styles.html",
],
- deps = [
+ deps=[
"@org_polymer",
"@org_polymer_paper_styles",
- ],
- )
+ ],)
webfiles_external(
- name = "org_polymer_paper_menu",
- licenses = ["notice"], # BSD-3-Clause
- sha256 = "a3cee220926e315f7412236b3628288774694447c0da4428345f36d0f127ba3b",
- urls = [
+ name="org_polymer_paper_menu",
+ licenses=["notice"], # BSD-3-Clause
+ sha256="a3cee220926e315f7412236b3628288774694447c0da4428345f36d0f127ba3b",
+ urls=[
"http://bazel-mirror.storage.googleapis.com/github.com/PolymerElements/paper-menu/archive/v1.2.2.tar.gz",
"https://github.com/PolymerElements/paper-menu/archive/v1.2.2.tar.gz",
],
- strip_prefix = "paper-menu-1.2.2",
- path = "/paper-menu",
- srcs = [
+ strip_prefix="paper-menu-1.2.2",
+ path="/paper-menu",
+ srcs=[
"paper-menu.html",
"paper-menu-shared-styles.html",
"paper-submenu.html",
],
- deps = [
+ deps=[
"@org_polymer",
"@org_polymer_iron_behaviors",
"@org_polymer_iron_collapse",
"@org_polymer_iron_flex_layout",
"@org_polymer_iron_menu_behavior",
"@org_polymer_paper_styles",
- ],
- )
+ ],)
webfiles_external(
- name = "org_polymer_paper_menu_button",
- licenses = ["notice"], # BSD-3-Clause
- sha256 = "be3290c288a2bd4f9887213db22c75add99cc29ff4d088100c0bc4eb0e57997b",
- urls = [
+ name="org_polymer_paper_menu_button",
+ licenses=["notice"], # BSD-3-Clause
+ sha256="be3290c288a2bd4f9887213db22c75add99cc29ff4d088100c0bc4eb0e57997b",
+ urls=[
"http://bazel-mirror.storage.googleapis.com/github.com/PolymerElements/paper-menu-button/archive/v1.5.1.tar.gz",
"https://github.com/PolymerElements/paper-menu-button/archive/v1.5.1.tar.gz",
],
- strip_prefix = "paper-menu-button-1.5.1",
- path = "/paper-menu-button",
- srcs = [
+ strip_prefix="paper-menu-button-1.5.1",
+ path="/paper-menu-button",
+ srcs=[
"paper-menu-button.html",
"paper-menu-button-animations.html",
],
- deps = [
+ deps=[
"@org_polymer",
"@org_polymer_iron_a11y_keys_behavior",
"@org_polymer_iron_behaviors",
"@org_polymer_iron_dropdown",
"@org_polymer_neon_animation",
"@org_polymer_paper_styles",
- ],
- )
+ ],)
webfiles_external(
- name = "org_polymer_paper_progress",
- licenses = ["notice"], # BSD-3-Clause
- sha256 = "2b6776b2f023c1f344feea17ba29b58d879e46f8ed43b7256495054b5183fff6",
- urls = [
+ name="org_polymer_paper_progress",
+ licenses=["notice"], # BSD-3-Clause
+ sha256="2b6776b2f023c1f344feea17ba29b58d879e46f8ed43b7256495054b5183fff6",
+ urls=[
"http://bazel-mirror.storage.googleapis.com/github.com/PolymerElements/paper-progress/archive/v1.0.9.tar.gz",
"https://github.com/PolymerElements/paper-progress/archive/v1.0.9.tar.gz",
],
- strip_prefix = "paper-progress-1.0.9",
- path = "/paper-progress",
- srcs = ["paper-progress.html"],
- deps = [
+ strip_prefix="paper-progress-1.0.9",
+ path="/paper-progress",
+ srcs=["paper-progress.html"],
+ deps=[
"@org_polymer",
"@org_polymer_iron_flex_layout",
"@org_polymer_iron_range_behavior",
"@org_polymer_paper_styles",
- ],
- )
+ ],)
webfiles_external(
- name = "org_polymer_paper_radio_button",
- licenses = ["notice"], # BSD-3-Clause
- sha256 = "6e911d0c308aa388136b3af79d1bdcbe5a1f4159cbc79d71efb4ff3b6c0b4e91",
- urls = [
+ name="org_polymer_paper_radio_button",
+ licenses=["notice"], # BSD-3-Clause
+ sha256="6e911d0c308aa388136b3af79d1bdcbe5a1f4159cbc79d71efb4ff3b6c0b4e91",
+ urls=[
"http://bazel-mirror.storage.googleapis.com/github.com/PolymerElements/paper-radio-button/archive/v1.1.2.tar.gz",
"https://github.com/PolymerElements/paper-radio-button/archive/v1.1.2.tar.gz",
],
- strip_prefix = "paper-radio-button-1.1.2",
- path = "/paper-radio-button",
- srcs = ["paper-radio-button.html"],
- deps = [
+ strip_prefix="paper-radio-button-1.1.2",
+ path="/paper-radio-button",
+ srcs=["paper-radio-button.html"],
+ deps=[
"@org_polymer",
"@org_polymer_paper_behaviors",
"@org_polymer_paper_styles",
- ],
- )
+ ],)
webfiles_external(
- name = "org_polymer_paper_radio_group",
- licenses = ["notice"], # BSD-3-Clause
- sha256 = "7885ad1f81e9dcc03dcea4139b54a201ff55c18543770cd44f94530046c9e163",
- urls = [
+ name="org_polymer_paper_radio_group",
+ licenses=["notice"], # BSD-3-Clause
+ sha256="7885ad1f81e9dcc03dcea4139b54a201ff55c18543770cd44f94530046c9e163",
+ urls=[
"http://bazel-mirror.storage.googleapis.com/github.com/PolymerElements/paper-radio-group/archive/v1.0.9.tar.gz",
"https://github.com/PolymerElements/paper-radio-group/archive/v1.0.9.tar.gz",
],
- strip_prefix = "paper-radio-group-1.0.9",
- path = "/paper-radio-group",
- srcs = ["paper-radio-group.html"],
- deps = [
+ strip_prefix="paper-radio-group-1.0.9",
+ path="/paper-radio-group",
+ srcs=["paper-radio-group.html"],
+ deps=[
"@org_polymer",
"@org_polymer_iron_a11y_keys_behavior",
"@org_polymer_iron_selector",
"@org_polymer_paper_radio_button",
- ],
- )
+ ],)
webfiles_external(
- name = "org_polymer_paper_ripple",
- licenses = ["notice"], # BSD-3-Clause
- sha256 = "ba76bfb1c737260a8a103d3ca97faa1f7c3288c7db9b2519f401b7a782147c09",
- urls = [
+ name="org_polymer_paper_ripple",
+ licenses=["notice"], # BSD-3-Clause
+ sha256="ba76bfb1c737260a8a103d3ca97faa1f7c3288c7db9b2519f401b7a782147c09",
+ urls=[
"http://bazel-mirror.storage.googleapis.com/github.com/PolymerElements/paper-ripple/archive/v1.0.5.tar.gz",
"https://github.com/PolymerElements/paper-ripple/archive/v1.0.5.tar.gz",
],
- strip_prefix = "paper-ripple-1.0.5",
- path = "/paper-ripple",
- srcs = ["paper-ripple.html"],
- deps = [
+ strip_prefix="paper-ripple-1.0.5",
+ path="/paper-ripple",
+ srcs=["paper-ripple.html"],
+ deps=[
"@org_polymer",
"@org_polymer_iron_a11y_keys_behavior",
- ],
- )
+ ],)
webfiles_external(
- name = "org_polymer_paper_slider",
- licenses = ["notice"], # BSD-3-Clause
- sha256 = "08e7c541dbf5d2e959208810bfc03188e82ced87e4d30d325172967f67962c3c",
- urls = [
+ name="org_polymer_paper_slider",
+ licenses=["notice"], # BSD-3-Clause
+ sha256="08e7c541dbf5d2e959208810bfc03188e82ced87e4d30d325172967f67962c3c",
+ urls=[
"http://bazel-mirror.storage.googleapis.com/github.com/PolymerElements/paper-slider/archive/v1.0.10.tar.gz",
"https://github.com/PolymerElements/paper-slider/archive/v1.0.10.tar.gz",
],
- strip_prefix = "paper-slider-1.0.10",
- path = "/paper-slider",
- srcs = ["paper-slider.html"],
- deps = [
+ strip_prefix="paper-slider-1.0.10",
+ path="/paper-slider",
+ srcs=["paper-slider.html"],
+ deps=[
"@org_polymer",
"@org_polymer_iron_a11y_keys_behavior",
"@org_polymer_iron_flex_layout",
@@ -1781,43 +1731,39 @@ def tf_workspace(path_prefix = "", tf_repo_name = ""):
"@org_polymer_paper_input",
"@org_polymer_paper_progress",
"@org_polymer_paper_styles",
- ],
- )
+ ],)
webfiles_external(
- name = "org_polymer_paper_spinner",
- licenses = ["notice"], # BSD-3-Clause
- sha256 = "6a752907fab7899cbeed15b478e7b9299047c15fbf9d1561d6eb4d204bdbd178",
- urls = [
+ name="org_polymer_paper_spinner",
+ licenses=["notice"], # BSD-3-Clause
+ sha256="6a752907fab7899cbeed15b478e7b9299047c15fbf9d1561d6eb4d204bdbd178",
+ urls=[
"http://bazel-mirror.storage.googleapis.com/github.com/PolymerElements/paper-spinner/archive/v1.1.1.tar.gz",
"https://github.com/PolymerElements/paper-spinner/archive/v1.1.1.tar.gz",
],
- strip_prefix = "paper-spinner-1.1.1",
- path = "/paper-spinner",
- srcs = [
- "paper-spinner.html",
- "paper-spinner-behavior.html",
- "paper-spinner-lite.html",
- "paper-spinner-styles.html"
+ strip_prefix="paper-spinner-1.1.1",
+ path="/paper-spinner",
+ srcs=[
+ "paper-spinner.html", "paper-spinner-behavior.html",
+ "paper-spinner-lite.html", "paper-spinner-styles.html"
],
- deps = [
+ deps=[
"@org_polymer",
"@org_polymer_iron_flex_layout",
"@org_polymer_paper_styles",
- ],
- )
+ ],)
webfiles_external(
- name = "org_polymer_paper_styles",
- licenses = ["notice"], # BSD-3-Clause
- sha256 = "6d26b0a4c286402098853dc7388f6b22f30dfb7a74e47b34992ac03380144bb2",
- urls = [
+ name="org_polymer_paper_styles",
+ licenses=["notice"], # BSD-3-Clause
+ sha256="6d26b0a4c286402098853dc7388f6b22f30dfb7a74e47b34992ac03380144bb2",
+ urls=[
"http://bazel-mirror.storage.googleapis.com/github.com/PolymerElements/paper-styles/archive/v1.1.4.tar.gz",
"https://github.com/PolymerElements/paper-styles/archive/v1.1.4.tar.gz",
],
- strip_prefix = "paper-styles-1.1.4",
- path = "/paper-styles",
- srcs = [
+ strip_prefix="paper-styles-1.1.4",
+ path="/paper-styles",
+ srcs=[
"classes/global.html",
"classes/shadow.html",
"classes/shadow-layout.html",
@@ -1831,29 +1777,28 @@ def tf_workspace(path_prefix = "", tf_repo_name = ""):
"shadow.html",
"typography.html",
],
- deps = [
+ deps=[
"@org_polymer",
"@org_polymer_font_roboto",
"@org_polymer_iron_flex_layout",
- ],
- )
+ ],)
webfiles_external(
- name = "org_polymer_paper_tabs",
- licenses = ["notice"], # BSD-3-Clause
- sha256 = "c23b6a5221db35e5b1ed3eb8e8696b952572563e285adaec96aba1e3134db825",
- urls = [
+ name="org_polymer_paper_tabs",
+ licenses=["notice"], # BSD-3-Clause
+ sha256="c23b6a5221db35e5b1ed3eb8e8696b952572563e285adaec96aba1e3134db825",
+ urls=[
"http://bazel-mirror.storage.googleapis.com/github.com/PolymerElements/paper-tabs/archive/v1.7.0.tar.gz",
"https://github.com/PolymerElements/paper-tabs/archive/v1.7.0.tar.gz",
],
- strip_prefix = "paper-tabs-1.7.0",
- path = "/paper-tabs",
- srcs = [
+ strip_prefix="paper-tabs-1.7.0",
+ path="/paper-tabs",
+ srcs=[
"paper-tab.html",
"paper-tabs.html",
"paper-tabs-icons.html",
],
- deps = [
+ deps=[
"@org_polymer",
"@org_polymer_iron_behaviors",
"@org_polymer_iron_flex_layout",
@@ -1864,177 +1809,165 @@ def tf_workspace(path_prefix = "", tf_repo_name = ""):
"@org_polymer_paper_behaviors",
"@org_polymer_paper_icon_button",
"@org_polymer_paper_styles",
- ],
- )
+ ],)
webfiles_external(
- name = "org_polymer_paper_toast",
- licenses = ["notice"], # BSD-3-Clause
- sha256 = "55f623712ed1f2bae6d6fadc522a2458e083ccd44cc0a907672547e7b10758a9",
- urls = [
+ name="org_polymer_paper_toast",
+ licenses=["notice"], # BSD-3-Clause
+ sha256="55f623712ed1f2bae6d6fadc522a2458e083ccd44cc0a907672547e7b10758a9",
+ urls=[
"http://bazel-mirror.storage.googleapis.com/github.com/PolymerElements/paper-toast/archive/v1.3.0.tar.gz",
"https://github.com/PolymerElements/paper-toast/archive/v1.3.0.tar.gz",
],
- strip_prefix = "paper-toast-1.3.0",
- path = "/paper-toast",
- srcs = ["paper-toast.html"],
- deps = [
+ strip_prefix="paper-toast-1.3.0",
+ path="/paper-toast",
+ srcs=["paper-toast.html"],
+ deps=[
"@org_polymer",
"@org_polymer_iron_a11y_announcer",
"@org_polymer_iron_overlay_behavior",
- ],
- )
+ ],)
webfiles_external(
- name = "org_polymer_paper_toggle_button",
- licenses = ["notice"], # BSD-3-Clause
- sha256 = "4aa7cf0396fa2994a8bc2ac6e8428f48b07b945bb7c41bd52041ef5827b45de3",
- urls = [
+ name="org_polymer_paper_toggle_button",
+ licenses=["notice"], # BSD-3-Clause
+ sha256="4aa7cf0396fa2994a8bc2ac6e8428f48b07b945bb7c41bd52041ef5827b45de3",
+ urls=[
"http://bazel-mirror.storage.googleapis.com/github.com/PolymerElements/paper-toggle-button/archive/v1.2.0.tar.gz",
"https://github.com/PolymerElements/paper-toggle-button/archive/v1.2.0.tar.gz",
],
- strip_prefix = "paper-toggle-button-1.2.0",
- path = "/paper-toggle-button",
- srcs = ["paper-toggle-button.html"],
- deps = [
+ strip_prefix="paper-toggle-button-1.2.0",
+ path="/paper-toggle-button",
+ srcs=["paper-toggle-button.html"],
+ deps=[
"@org_polymer",
"@org_polymer_iron_flex_layout",
"@org_polymer_paper_behaviors",
"@org_polymer_paper_styles",
- ],
- )
+ ],)
webfiles_external(
- name = "org_polymer_paper_toolbar",
- licenses = ["notice"], # BSD-3-Clause
- sha256 = "dbddffc0654d9fb5fb48843087eebe16bf7a134902495a664c96c11bf8a2c63d",
- urls = [
+ name="org_polymer_paper_toolbar",
+ licenses=["notice"], # BSD-3-Clause
+ sha256="dbddffc0654d9fb5fb48843087eebe16bf7a134902495a664c96c11bf8a2c63d",
+ urls=[
"http://bazel-mirror.storage.googleapis.com/github.com/PolymerElements/paper-toolbar/archive/v1.1.4.tar.gz",
"https://github.com/PolymerElements/paper-toolbar/archive/v1.1.4.tar.gz",
],
- strip_prefix = "paper-toolbar-1.1.4",
- path = "/paper-toolbar",
- srcs = ["paper-toolbar.html"],
- deps = [
+ strip_prefix="paper-toolbar-1.1.4",
+ path="/paper-toolbar",
+ srcs=["paper-toolbar.html"],
+ deps=[
"@org_polymer",
"@org_polymer_iron_flex_layout",
"@org_polymer_paper_styles",
- ],
- )
+ ],)
webfiles_external(
- name = "org_polymer_paper_tooltip",
- licenses = ["notice"], # BSD-3-Clause
- sha256 = "4c6667acf01f73da14c3cbc0aa574bf14280304567987ee0314534328377d2ad",
- urls = [
+ name="org_polymer_paper_tooltip",
+ licenses=["notice"], # BSD-3-Clause
+ sha256="4c6667acf01f73da14c3cbc0aa574bf14280304567987ee0314534328377d2ad",
+ urls=[
"http://bazel-mirror.storage.googleapis.com/github.com/PolymerElements/paper-tooltip/archive/v1.1.2.tar.gz",
"https://github.com/PolymerElements/paper-tooltip/archive/v1.1.2.tar.gz",
],
- strip_prefix = "paper-tooltip-1.1.2",
- path = "/paper-tooltip",
- srcs = ["paper-tooltip.html"],
- deps = [
+ strip_prefix="paper-tooltip-1.1.2",
+ path="/paper-tooltip",
+ srcs=["paper-tooltip.html"],
+ deps=[
"@org_polymer",
"@org_polymer_neon_animation",
- ],
- )
+ ],)
webfiles_external(
- name = "org_polymer",
- licenses = ["notice"], # BSD-3-Clause
- sha256 = "07a9e62ffb52193da3af09adda2fbac5cc690439978520e2d03e783863f65f91",
- strip_prefix = "polymer-1.7.0",
- urls = [
+ name="org_polymer",
+ licenses=["notice"], # BSD-3-Clause
+ sha256="07a9e62ffb52193da3af09adda2fbac5cc690439978520e2d03e783863f65f91",
+ strip_prefix="polymer-1.7.0",
+ urls=[
"http://bazel-mirror.storage.googleapis.com/github.com/polymer/polymer/archive/v1.7.0.tar.gz",
"https://github.com/polymer/polymer/archive/v1.7.0.tar.gz",
],
- path = "/polymer",
- srcs = [
+ path="/polymer",
+ srcs=[
"polymer.html",
"polymer-micro.html",
"polymer-mini.html",
- ],
- )
+ ],)
webfiles_external(
- name = "org_polymer_prism",
- licenses = ["notice"], # MIT
- sha256 = "e06eb54f2a80e6b3cd0bd4d59f900423bcaee53fc03998a056df63740c684683",
- urls = [
+ name="org_polymer_prism",
+ licenses=["notice"], # MIT
+ sha256="e06eb54f2a80e6b3cd0bd4d59f900423bcaee53fc03998a056df63740c684683",
+ urls=[
"http://bazel-mirror.storage.googleapis.com/github.com/PrismJS/prism/archive/abee2b7587f1925e57777044270e2a1860810994.tar.gz",
"https://github.com/PrismJS/prism/archive/abee2b7587f1925e57777044270e2a1860810994.tar.gz",
],
- strip_prefix = "prism-abee2b7587f1925e57777044270e2a1860810994",
- path = "/prism",
- srcs = [
+ strip_prefix="prism-abee2b7587f1925e57777044270e2a1860810994",
+ path="/prism",
+ srcs=[
"prism.js",
"themes/prism.css",
- ],
- )
+ ],)
webfiles_external(
- name = "org_polymer_prism_element",
- licenses = ["notice"], # BSD-3-Clause
- sha256 = "ad70bf9cd5bbdf525d465e1b0658867ab4022193eb9c74087a839044b46312b4",
- urls = [
+ name="org_polymer_prism_element",
+ licenses=["notice"], # BSD-3-Clause
+ sha256="ad70bf9cd5bbdf525d465e1b0658867ab4022193eb9c74087a839044b46312b4",
+ urls=[
"http://bazel-mirror.storage.googleapis.com/github.com/PolymerElements/prism-element/archive/1.0.4.tar.gz",
"https://github.com/PolymerElements/prism-element/archive/1.0.4.tar.gz",
],
- strip_prefix = "prism-element-1.0.4",
- path = "/prism-element",
- srcs = [
+ strip_prefix="prism-element-1.0.4",
+ path="/prism-element",
+ srcs=[
"prism-highlighter.html",
"prism-import.html",
],
- deps = [
+ deps=[
"@org_polymer",
"@org_polymer_prism",
- ],
- )
+ ],)
webfiles_external(
- name = "org_polymer_promise_polyfill",
- licenses = ["notice"], # BSD-3-Clause
- sha256 = "4495450e5d884c3e16b537b43afead7f84d17c7dc061bcfcbf440eac083e4ef5",
- strip_prefix = "promise-polyfill-1.0.0",
- urls = [
+ name="org_polymer_promise_polyfill",
+ licenses=["notice"], # BSD-3-Clause
+ sha256="4495450e5d884c3e16b537b43afead7f84d17c7dc061bcfcbf440eac083e4ef5",
+ strip_prefix="promise-polyfill-1.0.0",
+ urls=[
"http://bazel-mirror.storage.googleapis.com/github.com/PolymerLabs/promise-polyfill/archive/v1.0.0.tar.gz",
"https://github.com/PolymerLabs/promise-polyfill/archive/v1.0.0.tar.gz",
],
- path = "/promise-polyfill",
- srcs = [
- "Promise.js",
- "Promise-Statics.js",
- "promise-polyfill.html",
+ path="/promise-polyfill",
+ srcs=[
+ "Promise.js", "Promise-Statics.js", "promise-polyfill.html",
"promise-polyfill-lite.html"
],
- deps = ["@org_polymer"],
- )
+ deps=["@org_polymer"],)
webfiles_external(
- name = "org_polymer_web_animations_js",
- licenses = ["notice"], # BSD-3-Clause
- sha256 = "f8bd760cbdeba131f6790bd5abe170bcbf7b1755ff58ed16d0b82fa8a7f34a7f",
- urls = [
+ name="org_polymer_web_animations_js",
+ licenses=["notice"], # BSD-3-Clause
+ sha256="f8bd760cbdeba131f6790bd5abe170bcbf7b1755ff58ed16d0b82fa8a7f34a7f",
+ urls=[
"http://bazel-mirror.storage.googleapis.com/github.com/web-animations/web-animations-js/archive/2.2.1.tar.gz",
"https://github.com/web-animations/web-animations-js/archive/2.2.1.tar.gz",
],
- strip_prefix = "web-animations-js-2.2.1",
- path = "/web-animations-js",
- srcs = ["web-animations-next-lite.min.js"],
- )
+ strip_prefix="web-animations-js-2.2.1",
+ path="/web-animations-js",
+ srcs=["web-animations-next-lite.min.js"],)
webfiles_external(
- name = "org_polymer_webcomponentsjs",
- licenses = ["notice"], # BSD-3-Clause
- sha256 = "138c43306ee0a6d699ddca9b3c6b0f4982974ea8b7bdad291ea7276c72301df9",
- urls = [
+ name="org_polymer_webcomponentsjs",
+ licenses=["notice"], # BSD-3-Clause
+ sha256="138c43306ee0a6d699ddca9b3c6b0f4982974ea8b7bdad291ea7276c72301df9",
+ urls=[
"http://bazel-mirror.storage.googleapis.com/github.com/webcomponents/webcomponentsjs/archive/v0.7.22.tar.gz",
"https://github.com/webcomponents/webcomponentsjs/archive/v0.7.22.tar.gz",
],
- strip_prefix = "webcomponentsjs-0.7.22",
- path = "/webcomponentsjs",
- srcs = [
+ strip_prefix="webcomponentsjs-0.7.22",
+ path="/webcomponentsjs",
+ srcs=[
"CustomElements.js",
"CustomElements.min.js",
"HTMLImports.js",
@@ -2047,5 +1980,4 @@ def tf_workspace(path_prefix = "", tf_repo_name = ""):
"webcomponents.min.js",
"webcomponents-lite.js",
"webcomponents-lite.min.js",
- ],
- )
+ ],)
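Every webfiles_external declaration in the hunks above follows one pattern; the diff only reflows it (dropping the spaces around "=" and folding the closing parenthesis onto the last argument). A minimal sketch of that pattern, using only parameters that appear in the calls above — the repository name, URLs, and file names below are hypothetical placeholders, not real targets:

    webfiles_external(
        name="org_example_component",  # hypothetical external repository name
        licenses=["notice"],  # license kind of the fetched archive
        sha256="<sha256-of-archive>",  # hypothetical checksum pinning the download
        urls=[
            # bazel-mirror copy first, upstream GitHub release second
            "http://bazel-mirror.storage.googleapis.com/github.com/Example/component/archive/v1.0.0.tar.gz",
            "https://github.com/Example/component/archive/v1.0.0.tar.gz",
        ],
        strip_prefix="component-1.0.0",  # top-level directory stripped from the archive
        path="/component",  # web path the component is served under
        srcs=["component.html"],  # files exported to dependents
        deps=["@org_polymer"],)  # other webfiles_external repositories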
diff --git a/third_party/gpus/cuda_configure.bzl b/third_party/gpus/cuda_configure.bzl
index bbe0442eaf..05ff584be0 100644
--- a/third_party/gpus/cuda_configure.bzl
+++ b/third_party/gpus/cuda_configure.bzl
@@ -39,6 +39,11 @@ _DEFAULT_CUDA_COMPUTE_CAPABILITIES = ["3.5", "5.2"]
# BEGIN cc_configure common functions.
def find_cc(repository_ctx):
"""Find the C++ compiler."""
+ # On Windows, we use Bazel's MSVC CROSSTOOL for GPU builds.
+ # Return a dummy value for GCC detection here to avoid an error.
+ if _cpu_value(repository_ctx) == "Windows":
+ return "/use/--config x64_windows_msvc/instead"
+
if _use_cuda_clang(repository_ctx):
target_cc_name = "clang"
cc_path_envvar = _CLANG_CUDA_COMPILER_PATH
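The new Windows guard keys off _cpu_value(repository_ctx), which is defined elsewhere in cuda_configure.bzl. As a rough sketch of how such a helper can derive that value from Bazel's repository_ctx API — an illustration under assumptions, not this file's actual implementation:

    def _cpu_value(repository_ctx):
        # repository_ctx.os.name reports the host OS of the Bazel server.
        os_name = repository_ctx.os.name.lower()
        if os_name.startswith("mac os"):
            return "Darwin"
        if os_name.find("windows") != -1:
            return "Windows"
        # Elsewhere, fall back to uname, e.g. "Linux".
        result = repository_ctx.execute(["uname", "-s"])
        return result.stdout.strip()

The dummy return value doubles as a breadcrumb: if it ever leaks into an error message, the string itself points the reader at building with --config x64_windows_msvc rather than relying on GCC detection.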
@@ -297,7 +302,7 @@ def _find_cuda_define(repository_ctx, cudnn_header_dir, define):
cudnn_h_path = repository_ctx.path("%s/cudnn.h" % cudnn_header_dir)
if not cudnn_h_path.exists:
auto_configure_fail("Cannot find cudnn.h at %s" % str(cudnn_h_path))
- result = repository_ctx.execute(["grep", "-E", define, str(cudnn_h_path)])
+ result = repository_ctx.execute(["grep", "--color=never", "-E", define, str(cudnn_h_path)])
if result.stderr:
auto_configure_fail("Error reading %s: %s" %
(result.stderr, str(cudnn_h_path)))
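The --color=never flag matters because repository_ctx.execute captures grep's stdout and the caller parses the matched #define line out of it; in environments where grep is forced to colorize (e.g. GREP_OPTIONS=--color=always), ANSI escape codes wrapped around the match would corrupt that parse. A hedged sketch of the kind of consumer the flag protects — the helper below is hypothetical, not code from this file:

    def _extract_define_value(stdout, define):
        # A matched line looks like: "#define CUDNN_MAJOR 5".
        for line in stdout.splitlines():
            if line.find(define) != -1:
                # Color escape codes around the match would break this split.
                return line.split(" ")[-1]
        return None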
@@ -874,6 +879,7 @@ def _cuda_autoconf_impl(repository_ctx):
_create_cuda_repository(repository_ctx)
+
cuda_configure = repository_rule(
implementation = _cuda_autoconf_impl,
environ = [
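For context, cuda_configure is a repository_rule: its implementation runs during workspace loading and re-runs whenever one of the environment variables listed in environ changes. A typical WORKSPACE invocation — the repository name here is an assumption for illustration:

    load("//third_party/gpus:cuda_configure.bzl", "cuda_configure")

    # Materializes a @local_config_cuda repository from the detected CUDA install.
    cuda_configure(name = "local_config_cuda")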
diff --git a/third_party/libxsmm.BUILD b/third_party/libxsmm.BUILD
index 037009c072..53a814b4b8 100644
--- a/third_party/libxsmm.BUILD
+++ b/third_party/libxsmm.BUILD
@@ -12,7 +12,7 @@ libxsmm_interface_arguments = "0 1"
# Arguments to ./scripts/libxsmm_config.py, see that file for detailed description.
# ilp64: 0 (no)
-# big: 0 (no)
+# big: 1 (yes)
# offload: 0 (no)
# alignment [b]
# prefetch: -1 (auto)
@@ -22,7 +22,7 @@ libxsmm_interface_arguments = "0 1"
# flags: 0 (none)
# alpha = 1
# beta = 1
-libxsmm_config_arguments = "0 0 0 64 -1 0 1 1 0 1 1"
+libxsmm_config_arguments = "0 1 0 64 -1 0 1 1 0 1 1"
# Arguments to ./scripts/libxsmm_dispatch.py, see that file for detailed description.
# (dummy argument)
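The two libxsmm.BUILD hunks above have to move in lockstep: libxsmm_config_arguments is a positional string handed to scripts/libxsmm_config.py, and its second field is the "big" flag documented in the comment block, so the comment flip from 0 (no) to 1 (yes) is only accurate because the string's second field changes with it. A small sanity-check sketch:

    # Field order follows the comment block (ilp64, big, offload, alignment, ...).
    libxsmm_config_arguments = "0 1 0 64 -1 0 1 1 0 1 1"
    fields = libxsmm_config_arguments.split(" ")
    print(fields[1])  # -> "1", i.e. big: 1 (yes)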
@@ -56,22 +56,26 @@ genrule(
cc_library(
name = "xsmm_avx",
srcs = [
- "src/libxsmm_main.c",
+ "src/libxsmm_cpuid_x86.c",
+ "src/libxsmm_dnn.c",
+ "src/libxsmm_dnn_convolution_backward.c",
+ "src/libxsmm_dnn_convolution_forward.c",
+ "src/libxsmm_dnn_convolution_weight_update.c",
+ "src/libxsmm_dnn_convolution_winograd_backward.c",
+ "src/libxsmm_dnn_convolution_winograd_forward.c",
+ "src/libxsmm_dnn_convolution_winograd_weight_update.c",
+ "src/libxsmm_dnn_handle.c",
"src/libxsmm_dump.c",
- "src/libxsmm_malloc.c",
+ "src/libxsmm_fsspmdm.c",
"src/libxsmm_gemm.c",
+ "src/libxsmm_main.c",
+ "src/libxsmm_malloc.c",
+ "src/libxsmm_perf.c",
+ "src/libxsmm_spmdm.c",
+ "src/libxsmm_sync.c",
"src/libxsmm_timer.c",
"src/libxsmm_trace.c",
"src/libxsmm_trans.c",
- "src/libxsmm_sync.c",
- "src/libxsmm_perf.c",
- "src/libxsmm_spmdm.c",
- "src/libxsmm_dnn.c",
- "src/libxsmm_dnn_handle.c",
- "src/libxsmm_dnn_convolution_forward.c",
- "src/libxsmm_dnn_convolution_backward.c",
- "src/libxsmm_dnn_convolution_weight_update.c",
- "src/libxsmm_cpuid_x86.c",
] + glob([
"src/generator_*.c",
]),
@@ -79,6 +83,7 @@ cc_library(
"include/libxsmm_cpuid.h",
"include/libxsmm_dnn.h",
"include/libxsmm_frontend.h",
+ "include/libxsmm_fsspmdm.h",
"include/libxsmm_generator.h",
"include/libxsmm_intrinsics_x86.h",
"include/libxsmm_macros.h",
@@ -91,14 +96,16 @@ cc_library(
"include/libxsmm.h",
"include/libxsmm_config.h",
"include/libxsmm_dispatch.h",
- ],
+ ] + glob([
+ # trigger a rebuild if a template changes
+ "src/template/*.c",
+ ]),
copts = [
"-mavx", # JIT does not work without avx anyway, and this silences some CRC32 warnings.
"-Wno-vla", # Libxsmm convolutions heavily use VLA.
],
defines = [
"LIBXSMM_BUILD",
- "LIBXSMM_CPUID_X86_NOINLINE",
"__BLAS=0",
],
includes = [