-rw-r--r--  README.md | 14
-rw-r--r--  configure.py | 2
-rw-r--r--  tensorflow/BUILD | 2
-rw-r--r--  tensorflow/cc/framework/gradients.cc | 10
-rw-r--r--  tensorflow/cc/framework/gradients_test.cc | 44
-rw-r--r--  tensorflow/compiler/tf2xla/kernels/batch_norm_op.cc | 4
-rw-r--r--  tensorflow/compiler/xla/service/gpu/convolution_thunk.cc | 65
-rw-r--r--  tensorflow/compiler/xla/service/gpu/convolution_thunk.h | 4
-rw-r--r--  tensorflow/compiler/xla/service/gpu/ir_emitter_unnested.cc | 101
-rw-r--r--  tensorflow/contrib/BUILD | 4
-rw-r--r--  tensorflow/contrib/android/java/org/tensorflow/contrib/android/TensorFlowInferenceInterface.java | 16
-rw-r--r--  tensorflow/contrib/boosted_trees/lib/trees/decision_tree.cc | 2
-rw-r--r--  tensorflow/contrib/cmake/external/farmhash.cmake | 1
-rw-r--r--  tensorflow/contrib/cmake/external/gif.cmake | 1
-rw-r--r--  tensorflow/contrib/cmake/external/grpc.cmake | 4
-rw-r--r--  tensorflow/contrib/cmake/external/jpeg.cmake | 1
-rw-r--r--  tensorflow/contrib/cmake/external/sqlite.cmake | 13
-rw-r--r--  tensorflow/contrib/cmake/tf_core_framework.cmake | 51
-rw-r--r--  tensorflow/contrib/crf/README.md | 30
-rw-r--r--  tensorflow/contrib/fused_conv/kernels/fused_conv2d_bias_activation_op.cc | 57
-rw-r--r--  tensorflow/contrib/gdr/gdr_memory_manager.cc | 182
-rw-r--r--  tensorflow/contrib/gdr/gdr_memory_manager.h | 10
-rw-r--r--  tensorflow/contrib/gdr/gdr_rendezvous_mgr.cc | 22
-rw-r--r--  tensorflow/contrib/gdr/gdr_worker.cc | 35
-rw-r--r--  tensorflow/contrib/labeled_tensor/python/ops/core.py | 2
-rw-r--r--  tensorflow/contrib/layers/__init__.py | 1
-rw-r--r--  tensorflow/contrib/layers/python/layers/layers.py | 2
-rw-r--r--  tensorflow/contrib/learn/BUILD | 2
-rw-r--r--  tensorflow/contrib/learn/python/learn/estimators/estimator.py | 1
-rw-r--r--  tensorflow/contrib/learn/python/learn/experiment.py | 23
-rw-r--r--  tensorflow/contrib/learn/python/learn/monitors.py | 5
-rw-r--r--  tensorflow/contrib/makefile/Dockerfile | 1
-rw-r--r--  tensorflow/contrib/makefile/Makefile | 1
-rwxr-xr-x  tensorflow/contrib/makefile/build_with_docker.sh | 27
-rwxr-xr-x  tensorflow/contrib/makefile/compile_nsync.sh | 2
-rwxr-xr-x  tensorflow/contrib/makefile/download_dependencies.sh | 21
-rw-r--r--  tensorflow/contrib/mpi_collectives/BUILD | 80
-rw-r--r--  tensorflow/contrib/mpi_collectives/README.md | 5
-rw-r--r--  tensorflow/contrib/mpi_collectives/__init__.py | 273
-rw-r--r--  tensorflow/contrib/mpi_collectives/mpi_allgather_test.py | 114
-rw-r--r--  tensorflow/contrib/mpi_collectives/mpi_allreduce_test.py | 153
-rw-r--r--  tensorflow/contrib/mpi_collectives/mpi_message.proto | 64
-rw-r--r--  tensorflow/contrib/mpi_collectives/mpi_ops.cc | 1236
-rw-r--r--  tensorflow/contrib/mpi_collectives/mpi_ops.py | 165
-rw-r--r--  tensorflow/contrib/mpi_collectives/mpi_ops_test.py | 296
-rw-r--r--  tensorflow/contrib/mpi_collectives/ring.cc | 80
-rw-r--r--  tensorflow/contrib/mpi_collectives/ring.cu.cc | 117
-rw-r--r--  tensorflow/contrib/mpi_collectives/ring.h | 327
-rw-r--r--  tensorflow/contrib/pi_examples/camera/Makefile | 6
-rw-r--r--  tensorflow/contrib/pi_examples/label_image/Makefile | 4
-rw-r--r--  tensorflow/contrib/rnn/python/ops/gru_ops.py | 18
-rw-r--r--  tensorflow/contrib/s3/BUILD | 102
-rw-r--r--  tensorflow/contrib/s3/s3_crypto.cc | 113
-rw-r--r--  tensorflow/contrib/s3/s3_crypto.h | 35
-rw-r--r--  tensorflow/contrib/s3/s3_file_system.cc | 575
-rw-r--r--  tensorflow/contrib/s3/s3_file_system.h | 60
-rw-r--r--  tensorflow/contrib/s3/s3_file_system_test.cc | 233
-rw-r--r--  tensorflow/contrib/tpu/ops/infeed_ops.cc | 10
-rw-r--r--  tensorflow/contrib/tpu/ops/outfeed_ops.cc | 10
-rw-r--r--  tensorflow/core/framework/common_shape_fns.cc | 9
-rw-r--r--  tensorflow/core/framework/common_shape_fns.h | 3
-rw-r--r--  tensorflow/core/framework/tensor.cc | 2
-rw-r--r--  tensorflow/core/grappler/clusters/single_machine.cc | 4
-rw-r--r--  tensorflow/core/grappler/costs/op_level_cost_estimator.cc | 38
-rw-r--r--  tensorflow/core/grappler/costs/op_level_cost_estimator.h | 8
-rw-r--r--  tensorflow/core/kernels/BUILD | 1
-rw-r--r--  tensorflow/core/kernels/conv_grad_filter_ops.cc | 57
-rw-r--r--  tensorflow/core/kernels/conv_grad_input_ops.cc | 55
-rw-r--r--  tensorflow/core/kernels/conv_grad_ops_3d.cc | 115
-rw-r--r--  tensorflow/core/kernels/conv_ops.cc | 54
-rw-r--r--  tensorflow/core/kernels/conv_ops_3d.cc | 54
-rw-r--r--  tensorflow/core/kernels/eigen_attention.h | 27
-rw-r--r--  tensorflow/core/kernels/fused_batch_norm_op.cc | 92
-rw-r--r--  tensorflow/core/kernels/fused_batch_norm_op.cu.cc | 2
-rw-r--r--  tensorflow/core/kernels/fused_batch_norm_op.h | 71
-rw-r--r--  tensorflow/core/kernels/mkl_aggregate_ops.cc | 110
-rw-r--r--  tensorflow/core/kernels/pooling_ops_3d_sycl.h | 2
-rw-r--r--  tensorflow/core/lib/jpeg/jpeg_mem.cc | 8
-rw-r--r--  tensorflow/core/ops/array_ops.cc | 29
-rw-r--r--  tensorflow/core/ops/nn_ops.cc | 14
-rw-r--r--  tensorflow/core/ops/ops.pbtxt | 19
-rw-r--r--  tensorflow/core/ops/state_ops.cc | 19
-rw-r--r--  tensorflow/core/platform/cpu_info.cc | 1
-rw-r--r--  tensorflow/core/public/version.h | 4
-rw-r--r--  tensorflow/docs_src/api_guides/python/contrib.layers.md | 1
-rw-r--r--  tensorflow/docs_src/api_guides/python/reading_data.md | 2
-rw-r--r--  tensorflow/docs_src/extend/adding_an_op.md | 42
-rw-r--r--  tensorflow/docs_src/get_started/mnist/beginners.md | 4
-rw-r--r--  tensorflow/docs_src/install/install_c.md | 2
-rw-r--r--  tensorflow/docs_src/install/install_go.md | 2
-rw-r--r--  tensorflow/docs_src/install/install_java.md | 18
-rw-r--r--  tensorflow/docs_src/install/install_linux.md | 2
-rw-r--r--  tensorflow/docs_src/install/install_mac.md | 2
-rw-r--r--  tensorflow/examples/android/README.md | 2
-rw-r--r--  tensorflow/examples/get_started/regression/BUILD | 1
-rw-r--r--  tensorflow/examples/speech_commands/train.py | 5
-rw-r--r--  tensorflow/go/session.go | 4
-rw-r--r--  tensorflow/go/tensor.go | 2
-rw-r--r--  tensorflow/java/src/main/java/org/tensorflow/Tensor.java | 122
-rw-r--r--  tensorflow/java/src/test/java/org/tensorflow/TensorTest.java | 13
-rw-r--r--  tensorflow/python/client/session.py | 2
-rw-r--r--  tensorflow/python/client/session_partial_run_test.py | 8
-rw-r--r--  tensorflow/python/estimator/canned/head.py | 1
-rw-r--r--  tensorflow/python/estimator/estimator.py | 1
-rw-r--r--  tensorflow/python/framework/constant_op.py | 42
-rw-r--r--  tensorflow/python/framework/graph_util_impl.py | 4
-rw-r--r--  tensorflow/python/framework/graph_util_test.py | 7
-rw-r--r--  tensorflow/python/framework/tensor_util.py | 2
-rw-r--r--  tensorflow/python/keras/_impl/keras/backend.py | 2
-rw-r--r--  tensorflow/python/kernel_tests/attention_ops_test.py | 13
-rw-r--r--  tensorflow/python/kernel_tests/in_topk_op_test.py | 5
-rw-r--r--  tensorflow/python/ops/array_ops.py | 3
-rw-r--r--  tensorflow/python/ops/control_flow_grad.py | 6
-rw-r--r--  tensorflow/python/ops/control_flow_ops.py | 28
-rw-r--r--  tensorflow/python/ops/image_ops_impl.py | 7
-rw-r--r--  tensorflow/python/ops/image_ops_test.py | 13
-rw-r--r--  tensorflow/python/ops/lookup_ops.py | 3
-rw-r--r--  tensorflow/python/ops/nn_fused_batchnorm_test.py | 3
-rw-r--r--  tensorflow/python/ops/nn_grad.py | 110
-rw-r--r--  tensorflow/python/ops/nn_ops.py | 7
-rw-r--r--  tensorflow/python/ops/summary_op_util.py | 2
-rw-r--r--  tensorflow/python/platform/base.i | 4
-rw-r--r--  tensorflow/python/training/learning_rate_decay.py | 8
-rw-r--r--  tensorflow/python/training/learning_rate_decay_test.py | 12
-rw-r--r--  tensorflow/python/util/tf_decorator.py | 3
-rw-r--r--  tensorflow/stream_executor/BUILD | 1
-rw-r--r--  tensorflow/stream_executor/cuda/cuda_dnn.cc | 192
-rw-r--r--  tensorflow/stream_executor/cuda/cuda_dnn.h | 6
-rw-r--r--  tensorflow/stream_executor/dnn.cc | 12
-rw-r--r--  tensorflow/stream_executor/dnn.h | 65
-rw-r--r--  tensorflow/stream_executor/stream.h | 2
-rw-r--r--  tensorflow/stream_executor/stream_executor_pimpl.cc | 6
-rw-r--r--  tensorflow/stream_executor/stream_executor_pimpl.h | 9
-rw-r--r--  tensorflow/third_party/mpi/mpi.bzl | 17
-rw-r--r--  tensorflow/tools/ci_build/Dockerfile.cpu.mpi | 24
-rwxr-xr-x  tensorflow/tools/ci_build/install/install_mpi.sh | 23
-rwxr-xr-x  tensorflow/tools/ci_build/pi/build_raspberry_pi.sh | 48
-rwxr-xr-x  tensorflow/tools/ci_build/update_version.py | 4
-rw-r--r--  tensorflow/tools/ci_build/windows/bazel/bazel_test_lib.sh | 10
-rw-r--r--  tensorflow/tools/ci_build/windows/bazel/common_env.sh | 5
-rw-r--r--  tensorflow/tools/docker/Dockerfile.devel | 2
-rw-r--r--  tensorflow/tools/docker/Dockerfile.devel-gpu | 2
-rw-r--r--  tensorflow/tools/docker/notebooks/3_mnist_from_scratch.ipynb | 2
-rw-r--r--  tensorflow/tools/pip_package/setup.py | 2
-rw-r--r--  tensorflow/workspace.bzl | 81
-rw-r--r--  third_party/toolchains/cpus/arm/CROSSTOOL.tpl | 4
146 files changed, 5695 insertions, 979 deletions
diff --git a/README.md b/README.md
index dcea7ac233..4cc53096e0 100644
--- a/README.md
+++ b/README.md
@@ -40,16 +40,16 @@ People who are a little more adventurous can also try our nightly binaries:
* We are pleased to announce that TensorFlow now offers nightly pip packages
under the [tf-nightly](https://pypi.python.org/pypi/tf-nightly) project on pypi.
Simply run `pip install tf-nightly` in a clean environment to install the nightly
-tensorflow build. We currently only support CPU-only packages on Linux and Mac.
-GPU packages on all platforms and Windows CPU-only packages will arrive soon!
+tensorflow build. We currently only support CPU packages on Linux, Mac, and Windows.
+GPU packages on all platforms will arrive soon!
**Individual whl files**
-* Linux CPU-only: [Python 2](https://ci.tensorflow.org/view/Nightly/job/nightly-matrix-cpu/TF_BUILD_IS_OPT=OPT,TF_BUILD_IS_PIP=PIP,TF_BUILD_PYTHON_VERSION=PYTHON2,label=cpu-slave/lastSuccessfulBuild/artifact/pip_test/whl/tensorflow-1.4.0dev-cp27-none-linux_x86_64.whl) ([build history](https://ci.tensorflow.org/view/Nightly/job/nightly-matrix-cpu/TF_BUILD_IS_OPT=OPT,TF_BUILD_IS_PIP=PIP,TF_BUILD_PYTHON_VERSION=PYTHON2,label=cpu-slave)) / [Python 3.4](https://ci.tensorflow.org/view/Nightly/job/nightly-matrix-cpu/TF_BUILD_IS_OPT=OPT,TF_BUILD_IS_PIP=PIP,TF_BUILD_PYTHON_VERSION=PYTHON3,label=cpu-slave/lastSuccessfulBuild/artifact/pip_test/whl/tensorflow-1.4.0dev-cp34-cp34m-linux_x86_64.whl) ([build history](https://ci.tensorflow.org/view/Nightly/job/nightly-matrix-cpu/TF_BUILD_IS_OPT=OPT,TF_BUILD_IS_PIP=PIP,TF_BUILD_PYTHON_VERSION=PYTHON3,label=cpu-slave/)) / [Python 3.5](https://ci.tensorflow.org/view/Nightly/job/nightly-python35-linux-cpu/lastSuccessfulBuild/artifact/pip_test/whl/tensorflow-1.4.0dev-cp35-cp35m-linux_x86_64.whl) ([build history](https://ci.tensorflow.org/view/Nightly/job/nightly-python35-linux-cpu/))
-* Linux GPU: [Python 2](https://ci.tensorflow.org/view/Nightly/job/nightly-matrix-linux-gpu/TF_BUILD_IS_OPT=OPT,TF_BUILD_IS_PIP=PIP,TF_BUILD_PYTHON_VERSION=PYTHON2,label=gpu-linux/lastSuccessfulBuild/artifact/pip_test/whl/tensorflow_gpu-1.4.0dev-cp27-none-linux_x86_64.whl) ([build history](https://ci.tensorflow.org/view/Nightly/job/nightly-matrix-linux-gpu/TF_BUILD_IS_OPT=OPT,TF_BUILD_IS_PIP=PIP,TF_BUILD_PYTHON_VERSION=PYTHON2,label=gpu-linux/)) / [Python 3.4](https://ci.tensorflow.org/view/Nightly/job/nightly-matrix-linux-gpu/TF_BUILD_IS_OPT=OPT,TF_BUILD_IS_PIP=PIP,TF_BUILD_PYTHON_VERSION=PYTHON3,label=gpu-linux/lastSuccessfulBuild/artifact/pip_test/whl/tensorflow_gpu-1.4.0dev-cp34-cp34m-linux_x86_64.whl) ([build history](https://ci.tensorflow.org/view/Nightly/job/nightly-matrix-linux-gpu/TF_BUILD_IS_OPT=OPT,TF_BUILD_IS_PIP=PIP,TF_BUILD_PYTHON_VERSION=PYTHON3,label=gpu-linux/)) / [Python 3.5](https://ci.tensorflow.org/view/Nightly/job/nightly-matrix-linux-gpu/TF_BUILD_IS_OPT=OPT,TF_BUILD_IS_PIP=PIP,TF_BUILD_PYTHON_VERSION=PYTHON3.5,label=gpu-linux/lastSuccessfulBuild/artifact/pip_test/whl/tensorflow_gpu-1.4.0dev-cp35-cp35m-linux_x86_64.whl) ([build history](https://ci.tensorflow.org/view/Nightly/job/nightly-matrix-linux-gpu/TF_BUILD_IS_OPT=OPT,TF_BUILD_IS_PIP=PIP,TF_BUILD_PYTHON_VERSION=PYTHON3.5,label=gpu-linux/))
-* Mac CPU-only: [Python 2](https://ci.tensorflow.org/view/Nightly/job/nightly-matrix-cpu/TF_BUILD_IS_OPT=OPT,TF_BUILD_IS_PIP=PIP,TF_BUILD_PYTHON_VERSION=PYTHON2,label=mac-slave/lastSuccessfulBuild/artifact/pip_test/whl/tensorflow-1.4.0dev-py2-none-any.whl) ([build history](https://ci.tensorflow.org/view/Nightly/job/nightly-matrix-cpu/TF_BUILD_IS_OPT=OPT,TF_BUILD_IS_PIP=PIP,TF_BUILD_PYTHON_VERSION=PYTHON2,label=mac-slave/)) / [Python 3](https://ci.tensorflow.org/view/Nightly/job/nightly-matrix-cpu/TF_BUILD_IS_OPT=OPT,TF_BUILD_IS_PIP=PIP,TF_BUILD_PYTHON_VERSION=PYTHON3,label=mac-slave/lastSuccessfulBuild/artifact/pip_test/whl/tensorflow-1.4.0dev-py3-none-any.whl) ([build history](https://ci.tensorflow.org/view/Nightly/job/nightly-matrix-cpu/TF_BUILD_IS_OPT=OPT,TF_BUILD_IS_PIP=PIP,TF_BUILD_PYTHON_VERSION=PYTHON3,label=mac-slave/))
-* Windows CPU-only: [Python 3.5 64-bit](https://ci.tensorflow.org/view/Nightly/job/nightly-win/M=windows,PY=35/lastSuccessfulBuild/artifact/cmake_build/tf_python/dist/tensorflow-1.4.0dev-cp35-cp35m-win_amd64.whl) ([build history](https://ci.tensorflow.org/view/Nightly/job/nightly-win/M=windows,PY=35/)) / [Python 3.6 64-bit](https://ci.tensorflow.org/view/Nightly/job/nightly-win/M=windows,PY=36/lastSuccessfulBuild/artifact/cmake_build/tf_python/dist/tensorflow-1.4.0dev-cp36-cp36m-win_amd64.whl) ([build history](https://ci.tensorflow.org/view/Nightly/job/nightly-win/M=windows,PY=36/))
-* Windows GPU: [Python 3.5 64-bit](https://ci.tensorflow.org/view/Nightly/job/nightly-win/M=windows-gpu,PY=35/lastSuccessfulBuild/artifact/cmake_build/tf_python/dist/tensorflow_gpu-1.4.0dev-cp35-cp35m-win_amd64.whl) ([build history](https://ci.tensorflow.org/view/Nightly/job/nightly-win/M=windows-gpu,PY=35/)) / [Python 3.6 64-bit](https://ci.tensorflow.org/view/Nightly/job/nightly-win/M=windows-gpu,PY=36/lastSuccessfulBuild/artifact/cmake_build/tf_python/dist/tensorflow_gpu-1.4.0dev-cp36-cp36m-win_amd64.whl) ([build history](https://ci.tensorflow.org/view/Nightly/job/nightly-win/M=windows-gpu,PY=36/))
+* Linux CPU-only: [Python 2](https://ci.tensorflow.org/view/tf-nightly/job/tf-nightly-linux/TF_BUILD_IS_OPT=OPT,TF_BUILD_IS_PIP=PIP,TF_BUILD_PYTHON_VERSION=PYTHON2,label=cpu-slave/lastSuccessfulBuild/artifact/pip_test/whl/tf_nightly-1.head-cp27-none-linux_x86_64.whl) ([build history](https://ci.tensorflow.org/view/tf-nightly/job/tf-nightly-linux/TF_BUILD_IS_OPT=OPT,TF_BUILD_IS_PIP=PIP,TF_BUILD_PYTHON_VERSION=PYTHON2,label=cpu-slave/)) / [Python 3.4](https://ci.tensorflow.org/view/tf-nightly/job/tf-nightly-linux/TF_BUILD_IS_OPT=OPT,TF_BUILD_IS_PIP=PIP,TF_BUILD_PYTHON_VERSION=PYTHON3,label=cpu-slave/lastSuccessfulBuild/artifact/pip_test/whl/tf_nightly-1.head-cp34-cp34m-linux_x86_64.whl) ([build history](https://ci.tensorflow.org/view/tf-nightly/job/tf-nightly-linux/TF_BUILD_IS_OPT=OPT,TF_BUILD_IS_PIP=PIP,TF_BUILD_PYTHON_VERSION=PYTHON3,label=cpu-slave/)) / [Python 3.5](https://ci.tensorflow.org/view/tf-nightly/job/tf-nightly-linux/TF_BUILD_IS_OPT=OPT,TF_BUILD_IS_PIP=PIP,TF_BUILD_PYTHON_VERSION=PYTHON3.5,label=cpu-slave/lastSuccessfulBuild/artifact/pip_test/whl/tf_nightly-1.head-cp35-cp35m-linux_x86_64.whl) ([build history](https://ci.tensorflow.org/view/tf-nightly/job/tf-nightly-linux/TF_BUILD_IS_OPT=OPT,TF_BUILD_IS_PIP=PIP,TF_BUILD_PYTHON_VERSION=PYTHON3.5,label=cpu-slave/))
+* Linux GPU: [Python 2](https://ci.tensorflow.org/view/tf-nightly/job/tf-nightly-linux/TF_BUILD_IS_OPT=OPT,TF_BUILD_IS_PIP=PIP,TF_BUILD_PYTHON_VERSION=PYTHON2,label=gpu-linux/42/artifact/pip_test/whl/tf_nightly_gpu-1.head-cp27-none-linux_x86_64.whl) ([build history](https://ci.tensorflow.org/view/tf-nightly/job/tf-nightly-linux/TF_BUILD_IS_OPT=OPT,TF_BUILD_IS_PIP=PIP,TF_BUILD_PYTHON_VERSION=PYTHON2,label=gpu-linux/)) / [Python 3.4](https://ci.tensorflow.org/view/tf-nightly/job/tf-nightly-linux/TF_BUILD_IS_OPT=OPT,TF_BUILD_IS_PIP=PIP,TF_BUILD_PYTHON_VERSION=PYTHON3,label=gpu-linux/lastSuccessfulBuild/artifact/pip_test/whl/tf_nightly_gpu-1.head-cp34-cp34m-linux_x86_64.whl) ([build history](https://ci.tensorflow.org/view/tf-nightly/job/tf-nightly-linux/TF_BUILD_IS_OPT=OPT,TF_BUILD_IS_PIP=PIP,TF_BUILD_PYTHON_VERSION=PYTHON3,label=gpu-linux/)) / [Python 3.5](https://ci.tensorflow.org/view/tf-nightly/job/tf-nightly-linux/TF_BUILD_IS_OPT=OPT,TF_BUILD_IS_PIP=PIP,TF_BUILD_PYTHON_VERSION=PYTHON3.5,label=gpu-linux/lastSuccessfulBuild/artifact/pip_test/whl/tf_nightly_gpu-1.head-cp35-cp35m-linux_x86_64.whl) ([build history](https://ci.tensorflow.org/view/tf-nightly/job/tf-nightly-linux/TF_BUILD_IS_OPT=OPT,TF_BUILD_IS_PIP=PIP,TF_BUILD_PYTHON_VERSION=PYTHON3.5,label=gpu-linux/))
+* Mac CPU-only: [Python 2](https://ci.tensorflow.org/view/tf-nightly/job/tf-nightly-mac/TF_BUILD_IS_OPT=OPT,TF_BUILD_IS_PIP=PIP,TF_BUILD_PYTHON_VERSION=PYTHON2,label=mac-slave/lastSuccessfulBuild/artifact/pip_test/whl/tf_nightly-1.head-py2-none-any.whl) ([build history](https://ci.tensorflow.org/view/tf-nightly/job/tf-nightly-mac/TF_BUILD_IS_OPT=OPT,TF_BUILD_IS_PIP=PIP,TF_BUILD_PYTHON_VERSION=PYTHON2,label=mac-slave/)) / [Python 3](https://ci.tensorflow.org/view/tf-nightly/job/tf-nightly-mac/TF_BUILD_IS_OPT=OPT,TF_BUILD_IS_PIP=PIP,TF_BUILD_PYTHON_VERSION=PYTHON3,label=mac-slave/lastSuccessfulBuild/artifact/pip_test/whl/tf_nightly-1.head-py3-none-any.whl) ([build history](https://ci.tensorflow.org/view/tf-nightly/job/tf-nightly-mac/TF_BUILD_IS_OPT=OPT,TF_BUILD_IS_PIP=PIP,TF_BUILD_PYTHON_VERSION=PYTHON3,label=mac-slave/))
+* Windows CPU-only: [Python 3.5 64-bit](http://ci.tensorflow.org/view/tf-nightly/job/tf-nightly-windows/M=windows,PY=35/lastSuccessfulBuild/artifact/cmake_build/tf_python/dist/tf_nightly-1.head-cp35-cp35m-win_amd64.whl) ([build history](http://ci.tensorflow.org/view/tf-nightly/job/tf-nightly-windows/M=windows,PY=35/)) / [Python 3.6 64-bit](http://ci.tensorflow.org/view/tf-nightly/job/tf-nightly-windows/M=windows,PY=36/lastSuccessfulBuild/artifact/cmake_build/tf_python/dist/tf_nightly-1.head-cp36-cp36m-win_amd64.whl) ([build history](http://ci.tensorflow.org/view/tf-nightly/job/tf-nightly-windows/M=windows,PY=36/))
+* Windows GPU: Coming soon!
* Android: [demo APK](https://ci.tensorflow.org/view/Nightly/job/nightly-android/lastSuccessfulBuild/artifact/out/tensorflow_demo.apk), [native libs](http://ci.tensorflow.org/view/Nightly/job/nightly-android/lastSuccessfulBuild/artifact/out/native/)
([build history](https://ci.tensorflow.org/view/Nightly/job/nightly-android/))
diff --git a/configure.py b/configure.py
index fecca405c7..df2c74d23d 100644
--- a/configure.py
+++ b/configure.py
@@ -251,7 +251,7 @@ def reset_tf_configure_bazelrc():
if not os.path.exists('.bazelrc'):
if os.path.exists(os.path.join(home, '.bazelrc')):
with open('.bazelrc', 'a') as f:
- f.write('import %s/.bazelrc\n' % home)
+ f.write('import %s/.bazelrc\n' % home.replace('\\', '/'))
else:
open('.bazelrc', 'w').close()
diff --git a/tensorflow/BUILD b/tensorflow/BUILD
index 4054318c4c..924f383a8e 100644
--- a/tensorflow/BUILD
+++ b/tensorflow/BUILD
@@ -381,6 +381,7 @@ filegroup(
"//tensorflow/contrib/losses:all_files",
"//tensorflow/contrib/meta_graph_transform:all_files",
"//tensorflow/contrib/metrics:all_files",
+ "//tensorflow/contrib/mpi_collectives:all_files",
"//tensorflow/contrib/ndlstm:all_files",
"//tensorflow/contrib/nearest_neighbor:all_files",
"//tensorflow/contrib/nn:all_files",
@@ -391,6 +392,7 @@ filegroup(
"//tensorflow/contrib/remote_fused_graph/pylib:all_files",
"//tensorflow/contrib/resampler:all_files",
"//tensorflow/contrib/rnn:all_files",
+ "//tensorflow/contrib/s3:all_files",
"//tensorflow/contrib/saved_model:all_files",
"//tensorflow/contrib/saved_model/cc/saved_model:all_files",
"//tensorflow/contrib/seq2seq:all_files",
diff --git a/tensorflow/cc/framework/gradients.cc b/tensorflow/cc/framework/gradients.cc
index 82469261e5..b665ce744d 100644
--- a/tensorflow/cc/framework/gradients.cc
+++ b/tensorflow/cc/framework/gradients.cc
@@ -175,8 +175,14 @@ Status SymbolicGradientBuilder::Initialize() {
"Must specify a gradient input for each output.");
}
std::vector<bool> reachable_nodes = GetReachableNodes();
- // TODO(theflofly) Check that inputs_ are reachable from
- // outputs_ using reachable_nodes
+ for (const Output& input : inputs_) {
+ if (!reachable_nodes[input.node()->id()]) {
+ return errors::InvalidArgument(
+ "Cannot compute the partial derivative for node '",
+ input.node()->name(),
+ "' as it's unreachable from the output node(s).");
+ }
+ }
grad_outputs_->clear();
grad_outputs_->resize(inputs_.size());
// Populate `output_nodes_` from node ids in `outputs_`.
diff --git a/tensorflow/cc/framework/gradients_test.cc b/tensorflow/cc/framework/gradients_test.cc
index 032ab93623..dcaf10c340 100644
--- a/tensorflow/cc/framework/gradients_test.cc
+++ b/tensorflow/cc/framework/gradients_test.cc
@@ -48,9 +48,9 @@ class GradientsTest : public ::testing::Test {
Scope scope_test_;
};
-// EX.
+// Example:
// ^ ^
-// dy| dx| // MatMul Gradient Graph
+// dy| dx| (MatMul Gradient Graph)
// | |
// MatMul_1 MatMul_2
// ^ ^ ^ ^
@@ -61,7 +61,7 @@ class GradientsTest : public ::testing::Test {
// | Const_3 |
// | |
// | ^ |
-// | z| | // MatMul Forward Graph
+// | z| | (MatMul Forward Graph)
// | | |
// | MatMul_0 |
// | / \ |
@@ -373,24 +373,22 @@ TEST_F(GradientsTest, UnreachableEdgeGradOneOutput) {
auto y_const = Const(scope_test_, {{1.0}, {2.0}, {3.0}});
auto y_assign = Assign(scope_test_, y, y_const);
- auto m1 = MatMul(scope_test_, x, y);
+ auto m = MatMul(scope_test_, x, y);
auto z = Variable(scope_test_, {1, 3}, DT_DOUBLE);
auto z_const = Const(scope_test_, {{9.0, 10.0, 11.0}});
auto z_assign = Assign(scope_test_, z, z_const);
- auto m2 = MatMul(scope_test_, y, z);
-
- auto dm1 = Const(scope_test_, {{0.5}, {0.5}});
+ auto diff_m = Const(scope_test_, {{0.5}, {0.5}});
std::vector<Output> grad_outputs;
TF_ASSERT_OK(
- AddSymbolicGradients(scope_test_, {m1}, {y}, {dm1}, &grad_outputs));
+ AddSymbolicGradients(scope_test_, {m}, {y}, {diff_m}, &grad_outputs));
std::vector<Tensor> outputs;
test::GetTensors(scope_test_, {x_assign, y_assign, z_assign},
{grad_outputs[0]}, &outputs);
- // dz/dy = xT * dm1
+ // dz/dy = xT * diff_m
test::ExpectTensorNear<double>(
outputs[0], test::AsTensor<double>({2.5, 3.5, 4.5}, {3, 1}), 1e-5);
}
@@ -424,13 +422,37 @@ TEST_F(GradientsTest, UnreachableEdgeGradTwoOutputs) {
test::GetTensors(scope_test_, {x_assign, y_assign, z_assign},
{grad_outputs[0]}, &outputs);
- // the gradients from m1 and m2 will be summed to compute the gradient
- // w.r.t y
+ // The gradients from m1 and m2 will be summed to compute the gradient
+ // w.r.t y:
// dz/dy = xT * dm1 + dm2 * zT
test::ExpectTensorNear<double>(
outputs[0], test::AsTensor<double>({17.5, 24.7, 26.8}, {3, 1}), 1e-5);
}
+TEST_F(GradientsTest, UnreachableInput) {
+ auto x = Const(scope_test_, {{1.0, 2.0, 3.0}, {4.0, 5.0, 6.0}});
+ auto y = Const(scope_test_, {{1.0}, {2.0}, {3.0}});
+ auto z = Const(scope_test_.WithOpName("z"), {{9.0, 10.0, 11.0}});
+
+ auto m1 = MatMul(scope_test_, x, y);
+ auto m2 = MatMul(scope_test_, y, z);
+ auto dm1 = Const(scope_test_, {{0.5}, {0.5}});
+
+ // From m1, z is unreachable, so an error status should be returned.
+ // m2 m1
+ // | |
+ // * *
+ // / \ / \
+ // z y x
+ std::vector<Output> grad_outputs;
+ Status status =
+ AddSymbolicGradients(scope_test_, {m1}, {z}, {dm1}, &grad_outputs);
+ EXPECT_EQ(status.code(), error::INVALID_ARGUMENT);
+ EXPECT_EQ(status.error_message(),
+ "Cannot compute the partial derivative"
+ " for node 'z' as it's unreachable from the output node(s).");
+}
+
// StopGradientSingleOutputMultiEdgeTest tests combinations of valid and
// 'NoGradient' (induced by StopGradient op) returned along multiple edges from
// a single nodes output.
diff --git a/tensorflow/compiler/tf2xla/kernels/batch_norm_op.cc b/tensorflow/compiler/tf2xla/kernels/batch_norm_op.cc
index 9d2703bf95..248e9d111e 100644
--- a/tensorflow/compiler/tf2xla/kernels/batch_norm_op.cc
+++ b/tensorflow/compiler/tf2xla/kernels/batch_norm_op.cc
@@ -85,6 +85,10 @@ class FusedBatchNormGradOp : public XlaOpKernel {
string data_format;
OP_REQUIRES_OK(ctx, ctx->GetAttr("epsilon", &epsilon_));
OP_REQUIRES_OK(ctx, ctx->GetAttr("data_format", &data_format));
+ bool is_training;
+ OP_REQUIRES_OK(ctx, ctx->GetAttr("is_training", &is_training));
+ CHECK(is_training) << "FusedBatchNormGradOp with is_training=False cannot "
+ "be used with XLA for now!";
TensorFormat tensor_format;
if (ctx->GetAttr("data_format", &data_format).ok()) {
OP_REQUIRES(ctx, FormatFromString(data_format, &tensor_format),
diff --git a/tensorflow/compiler/xla/service/gpu/convolution_thunk.cc b/tensorflow/compiler/xla/service/gpu/convolution_thunk.cc
index 20e0d8eb78..89145a9038 100644
--- a/tensorflow/compiler/xla/service/gpu/convolution_thunk.cc
+++ b/tensorflow/compiler/xla/service/gpu/convolution_thunk.cc
@@ -33,6 +33,7 @@ using se::dnn::ConvolutionDescriptor;
using se::dnn::DataLayout;
using se::dnn::FilterDescriptor;
using se::dnn::FilterLayout;
+using se::dnn::AlgorithmDesc;
ConvolveScratchAllocator::ConvolveScratchAllocator(
int device_ordinal, DeviceMemoryAllocator* memory_allocator)
@@ -251,12 +252,13 @@ tensorflow::Status ConvolutionThunk::Convolve(
"Unable to launch convolution for thunk %p with type %s and algorithm "
"(%lld, %lld)",
this, ConvolutionKindToString(convolution_kind_).c_str(),
- algorithm_config.algorithm(), algorithm_config.algorithm_no_scratch());
+ algorithm_config.algorithm().algo_id(),
+ algorithm_config.algorithm_no_scratch().algo_id());
}
-std::vector<se::dnn::AlgorithmType> ConvolutionThunk::GetAlgorithms(
+std::vector<AlgorithmDesc::Index> ConvolutionThunk::GetAlgorithms(
se::StreamExecutor* stream_exec) const {
- std::vector<se::dnn::AlgorithmType> algorithms;
+ std::vector<AlgorithmDesc::Index> algorithms;
// TODO(yangzihao): Currently disable the use of winograd nonfused in XLA
// by default. Should send in conv parameters and enable it when
// ShouldIncludeWinogradNonfusedAlgo() returns true.
@@ -286,7 +288,7 @@ tensorflow::Status ConvolutionThunk::ConvolveWithTune(
const ConvolutionDescriptor& convolution_descriptor,
const BufferAllocations& buffer_allocations, se::Stream* stream) {
// TODO(b/29126320): Try cudnn v5's new auto-tuner when it's rolled out.
- if (best_algorithm_.algorithm() == se::dnn::kDefaultAlgorithm) {
+ if (best_algorithm_.algorithm().is_default()) {
// Auto-tuning either is disabled or only happens in the first run of this
// function.
VLOG(2) << "Profiling for best convolution algorithm used for "
@@ -295,26 +297,32 @@ tensorflow::Status ConvolutionThunk::ConvolveWithTune(
se::dnn::ProfileResult best_result;
se::dnn::ProfileResult best_result_without_scratch;
- for (se::dnn::AlgorithmType algorithm : GetAlgorithms(stream->parent())) {
- ConvolveScratchAllocator scratch_allocator(
- buffer_allocations.device_ordinal(),
- buffer_allocations.memory_allocator());
- se::dnn::ProfileResult profile_result;
- bool launch_ok =
- Convolve(input_descriptor, input_data, filter_descriptor, filter_data,
- output_descriptor, output_data, convolution_descriptor,
- se::dnn::AlgorithmConfig(algorithm, algorithm), stream,
- &scratch_allocator, &profile_result)
- .ok();
- if (launch_ok && profile_result.is_valid()) {
- if (profile_result.elapsed_time_in_ms() <
- best_result.elapsed_time_in_ms()) {
- best_result = profile_result;
- }
- if (scratch_allocator.TotalAllocatedBytes() == 0 &&
- profile_result.elapsed_time_in_ms() <
- best_result_without_scratch.elapsed_time_in_ms()) {
- best_result_without_scratch = profile_result;
+ std::vector<AlgorithmDesc::Index> algorithms =
+ GetAlgorithms(stream->parent());
+ for (bool use_tensor_ops : {false, true}) {
+ for (auto algo_index : algorithms) {
+ AlgorithmDesc algorithm(algo_index, use_tensor_ops);
+ ConvolveScratchAllocator scratch_allocator(
+ buffer_allocations.device_ordinal(),
+ buffer_allocations.memory_allocator());
+ se::dnn::ProfileResult profile_result;
+ bool launch_ok =
+ Convolve(input_descriptor, input_data, filter_descriptor,
+ filter_data, output_descriptor, output_data,
+ convolution_descriptor,
+ se::dnn::AlgorithmConfig(algorithm, algorithm), stream,
+ &scratch_allocator, &profile_result)
+ .ok();
+ if (launch_ok && profile_result.is_valid()) {
+ if (profile_result.elapsed_time_in_ms() <
+ best_result.elapsed_time_in_ms()) {
+ best_result = profile_result;
+ }
+ if (scratch_allocator.TotalAllocatedBytes() == 0 &&
+ profile_result.elapsed_time_in_ms() <
+ best_result_without_scratch.elapsed_time_in_ms()) {
+ best_result_without_scratch = profile_result;
+ }
}
}
}
@@ -324,7 +332,7 @@ tensorflow::Status ConvolutionThunk::ConvolveWithTune(
} else {
LOG(ERROR) << "No convolution algorithm works with profiling. Fall back "
"to the default algorithm.";
- best_algorithm_.set_algorithm(se::dnn::kDefaultAlgorithm);
+ best_algorithm_.set_algorithm(AlgorithmDesc());
}
if (best_result_without_scratch.is_valid()) {
@@ -334,13 +342,14 @@ tensorflow::Status ConvolutionThunk::ConvolveWithTune(
LOG(ERROR) << "No convolution algorithm without scratch works with "
"profiling. Fall back "
"to the default algorithm.";
- best_algorithm_.set_algorithm_no_scratch(se::dnn::kDefaultAlgorithm);
+ best_algorithm_.set_algorithm_no_scratch(AlgorithmDesc());
}
}
{
- VLOG(2) << "Using convolution algorithm (" << best_algorithm_.algorithm()
- << ", " << best_algorithm_.algorithm_no_scratch()
+ VLOG(2) << "Using convolution algorithm ("
+ << best_algorithm_.algorithm().algo_id() << ", "
+ << best_algorithm_.algorithm_no_scratch().algo_id()
<< ") for ConvolutionThunk: " << this;
ConvolveScratchAllocator scratch_allocator(
buffer_allocations.device_ordinal(),
diff --git a/tensorflow/compiler/xla/service/gpu/convolution_thunk.h b/tensorflow/compiler/xla/service/gpu/convolution_thunk.h
index 91d6df299d..509719c1fe 100644
--- a/tensorflow/compiler/xla/service/gpu/convolution_thunk.h
+++ b/tensorflow/compiler/xla/service/gpu/convolution_thunk.h
@@ -115,7 +115,9 @@ class ConvolutionThunk : public Thunk {
perftools::gputools::dnn::ProfileResult* profile_result);
// Returns the convolve algorithms that can be used for this ConvolutionThunk.
- std::vector<perftools::gputools::dnn::AlgorithmType> GetAlgorithms(
+ // TODO(nluehr) GetAlgorithms should return AlgorithmDesc including both
+ // tensor-op and non-tensor-op variants.
+ std::vector<perftools::gputools::dnn::AlgorithmDesc::Index> GetAlgorithms(
perftools::gputools::StreamExecutor* stream_exec) const;
// Fastest cuDNN convolution algorithm for this thunk learned from
diff --git a/tensorflow/compiler/xla/service/gpu/ir_emitter_unnested.cc b/tensorflow/compiler/xla/service/gpu/ir_emitter_unnested.cc
index f4187cdade..958408e875 100644
--- a/tensorflow/compiler/xla/service/gpu/ir_emitter_unnested.cc
+++ b/tensorflow/compiler/xla/service/gpu/ir_emitter_unnested.cc
@@ -544,9 +544,9 @@ bool AreShapesForTranspose021(const Shape& a, const Shape& b) {
// Emits a tiled 0-2-1 transpose, assuming both input and output lain out from
// major to minor. The x- and y- dimensions are tiled in square tiles of edge
-// length `tile_size`. Each thread block of `tile_size` threads transposes one
-// tile: each thread copies a row from the input to a shared memory tile, then
-// copies a column from the shared memory tile to the output.
+// length `tile_size`. Each thread block of `tile_size` x `num_rows` threads
+// transposes one tile: each thread copies a row from the input to a shared
+// memory tile, then copies a column from the shared memory tile to the output.
//
// `tile_size` should usually be same as warp size.
//
@@ -557,9 +557,10 @@ bool AreShapesForTranspose021(const Shape& a, const Shape& b) {
// in any case, the number of blocks we can launch is limited.
//
// This is the same algorithm in CUDA:
-// https://github.com/tensorflow/tensorflow/blob/6172351b81af76d0b819fea6bb478cbd4016d6c2/tensorflow/core/kernels/conv_ops_gpu_3.cu.cc#L183
+// https://github.com/tensorflow/tensorflow/blob/d2693c8a70567cc78b2e8a9ac8020d321620ca83/tensorflow/core/kernels/conv_ops_gpu_3.cu.cc#L189
int64 EmitTranspose021Tiled(llvm_ir::IrArray input, llvm_ir::IrArray output,
- const int64 tile_size, llvm::IRBuilder<>* builder) {
+ const int64 tile_size, const int64 num_rows,
+ llvm::IRBuilder<>* builder) {
// Adds `addend` to the given `dim` of `index`.
auto offset_dim = [builder](llvm_ir::IrArray::Index index,
llvm::Value* addend, int64 dim) {
@@ -590,18 +591,29 @@ int64 EmitTranspose021Tiled(llvm_ir::IrArray input, llvm_ir::IrArray output,
// let x = threadIdx.x
llvm::Value* x = llvm_ir::EmitCallToIntrinsic(
llvm::Intrinsic::nvvm_read_ptx_sreg_tid_x, {}, {}, builder);
- llvm_ir::AddRangeMetadata(0, tile_size, static_cast<llvm::Instruction*>(x));
+ llvm_ir::AddRangeMetadata(0, num_rows * tile_size,
+ static_cast<llvm::Instruction*>(x));
x = builder->CreateIntCast(x, builder->getInt64Ty(), /*isSigned=*/true,
"thread.id.x");
+ // computing logical thread ids
+ // logical_x = x % tile_size
+ auto logical_x = builder->CreateURem(x, builder->getInt64(tile_size));
+
+ // logical_y = x / tile_size
+ auto logical_y = builder->CreateUDiv(x, builder->getInt64(tile_size));
+
// `emit_cp` emits equivalent to following pseudocode:
// if (tile_size == tile_width && tile_size == tile_height) {
- // unroll for (y in 0..tile_size) {
- // emit_cp_element(index + {0, y, 0}, y);
+ // unroll for (i in range(0, tile_size, num_rows)) {
+ // emit_cp_element(index + {0, i, 0}, y + logical_y);
// }
// } else if (x < tile_width) {
- // for (y in 0..tile_height) {
- // emit_cp_element(index + {0, y, 0}, y);
+ // tile_height_upperbound = ceil(tile_height / num_rows) * num_rows;
+ // for (i in range(0, tile_height_upperbound, num_rows)) {
+ // y_loc = i + logical_y;
+ // if (y_loc < tile_height)
+ // emit_cp_element(index + {0, i, 0}, y_loc);
// }
// }
//
@@ -615,32 +627,50 @@ int64 EmitTranspose021Tiled(llvm_ir::IrArray input, llvm_ir::IrArray output,
// tile, whether which is row or column is a function of whether we're copying
// from input or to output, and `index` is the index into the input or output
// array.
- auto emit_cp_tile = [builder, tile_size, x, &offset_dim](
- std::function<void(const llvm_ir::IrArray::Index&, llvm::Value*)>
- emit_cp_element,
- llvm::Value* tile_width, llvm::Value* tile_height,
- const llvm_ir::IrArray::Index& index, const string& loop_name) {
+ auto emit_cp_tile = [builder, tile_size, &offset_dim, num_rows, logical_x,
+ logical_y](
+ std::function<void(const llvm_ir::IrArray::Index&,
+ llvm::Value*)>
+ emit_cp_element,
+ llvm::Value* tile_width, llvm::Value* tile_height,
+ const llvm_ir::IrArray::Index& index,
+ const string& loop_name) {
llvm_ir::LlvmIfData if_not_last_row = llvm_ir::EmitIfThenElse(
builder->CreateAnd(
builder->CreateICmpEQ(builder->getInt64(tile_size), tile_width),
builder->CreateICmpEQ(builder->getInt64(tile_size), tile_height)),
"not_last_row", builder);
builder->SetInsertPoint(if_not_last_row.true_block->getTerminator());
- for (int64 i = 0; i < tile_size; ++i) {
- emit_cp_element(offset_dim(index, builder->getInt64(i), /*dim=*/1),
- builder->getInt64(i));
+ for (int64 i = 0; i < tile_size; i += num_rows) {
+ auto source_idx = offset_dim(index, builder->getInt64(i), /*dim=*/1);
+ auto y_loc = builder->CreateAdd(builder->getInt64(i), logical_y);
+ emit_cp_element(source_idx, y_loc);
}
builder->SetInsertPoint(if_not_last_row.false_block->getTerminator());
llvm_ir::LlvmIfData if_in_tile = llvm_ir::EmitIfThenElse(
- builder->CreateICmpULT(x, tile_width), "in_tile", builder);
+ builder->CreateICmpULT(logical_x, tile_width), "x_in_tile", builder);
builder->SetInsertPoint(if_in_tile.true_block->getTerminator());
- auto loop = llvm_ir::ForLoop::EmitForLoop(loop_name, builder->getInt64(0),
- tile_height, builder->getInt64(1),
- builder);
+
+ // tile_height_upper_bound = ceil(tile_height / num_rows) * num_rows
+ auto tile_height_upper_bound = builder->CreateMul(
+ builder->CreateUDiv(
+ builder->CreateAdd(tile_height, builder->getInt64(num_rows - 1)),
+ builder->getInt64(num_rows)),
+ builder->getInt64(num_rows));
+
+ auto loop = llvm_ir::ForLoop::EmitForLoop(
+ loop_name, builder->getInt64(0), tile_height_upper_bound,
+ builder->getInt64(num_rows), builder);
llvm_ir::SetToFirstInsertPoint(loop->GetHeaderBasicBlock(), builder);
builder->SetInsertPoint(loop->GetBodyBasicBlock()->getTerminator());
+
+ auto y_loc = builder->CreateAdd(loop->GetIndVarValue(), logical_y);
+ auto if_y_in_tile = llvm_ir::EmitIfThenElse(
+ builder->CreateICmpULT(y_loc, tile_height), "y_in_tile", builder);
+ builder->SetInsertPoint(if_y_in_tile.true_block->getTerminator());
+
emit_cp_element(offset_dim(index, loop->GetIndVarValue(), /*dim=*/1),
- loop->GetIndVarValue());
+ y_loc);
builder->SetInsertPoint(if_not_last_row.after_block->getTerminator());
};
@@ -673,7 +703,8 @@ int64 EmitTranspose021Tiled(llvm_ir::IrArray input, llvm_ir::IrArray output,
index;
});
const llvm_ir::IrArray::Index input_index =
- offset_dim(input_tile_origin, x, /*dim=*/2);
+ offset_dim(offset_dim(input_tile_origin, logical_x, /*dim=*/2), logical_y,
+ /*dim=*/1);
std::vector<llvm::Value*> tile_dims(input_shape.dimensions().size());
// Only last row or column may not have full size.
for (int i = 1; i < 3; ++i) {
@@ -688,11 +719,11 @@ int64 EmitTranspose021Tiled(llvm_ir::IrArray input, llvm_ir::IrArray output,
// Load data from input memory to shared memory tile.
emit_cp_tile(
// tile[y, x] = input_array[index]
- [builder, tile, x, &input](const llvm_ir::IrArray::Index& index,
- llvm::Value* y) {
+ [builder, tile, &input, logical_x](const llvm_ir::IrArray::Index& index,
+ llvm::Value* y) {
builder->CreateStore(
input.EmitReadArrayElement(index, builder, "input_element"),
- builder->CreateGEP(tile, {builder->getInt64(0), y, x}));
+ builder->CreateGEP(tile, {builder->getInt64(0), y, logical_x}));
},
tile_dims[2], tile_dims[1], input_index, "input");
@@ -706,17 +737,18 @@ int64 EmitTranspose021Tiled(llvm_ir::IrArray input, llvm_ir::IrArray output,
const llvm_ir::IrArray::Index output_tile_origin(
Permute({0, 2, 1}, input_tile_origin.multidim()));
const llvm_ir::IrArray::Index output_index =
- offset_dim(output_tile_origin, x, /*dim=*/2);
+ offset_dim(offset_dim(output_tile_origin, logical_x, /*dim=*/2),
+ logical_y, /*dim=*/1);
// Store data from shared memory tile to output memory.
emit_cp_tile(
// output_array[index] = tile[x, y]
- [builder, tile, x, &output](const llvm_ir::IrArray::Index& index,
- llvm::Value* y) {
+ [builder, tile, &output, logical_x](const llvm_ir::IrArray::Index& index,
+ llvm::Value* y) {
output.EmitWriteArrayElement(
index,
builder->CreateLoad(
- builder->CreateGEP(tile, {builder->getInt64(0), x, y}),
+ builder->CreateGEP(tile, {builder->getInt64(0), logical_x, y}),
"output_element"),
builder);
},
@@ -742,13 +774,14 @@ Status IrEmitterUnnested::HandleCopy(HloInstruction* copy) {
thunk_sequence_->emplace_back(BuildKernelThunk(copy));
VLOG(3) << "Emitting tiled 0-2-1 transposition";
constexpr int64 tile_size = 32;
+ constexpr int64 num_rows = 8;
int64 num_tiles = EmitTranspose021Tiled(
GetIrArray(*(copy->operand(0)))
.CastToShape(reduced_input_shape, &ir_builder_),
GetIrArray(*copy).CastToShape(reduced_output_shape, &ir_builder_),
- tile_size, &ir_builder_);
- UpdateLaunchDimensions(LaunchDimensions(num_tiles, tile_size), LastThunk(),
- ir_emitter_context_->llvm_module());
+ tile_size, num_rows, &ir_builder_);
+ UpdateLaunchDimensions(LaunchDimensions(num_tiles, num_rows * tile_size),
+ LastThunk(), ir_emitter_context_->llvm_module());
return Status::OK();
}
diff --git a/tensorflow/contrib/BUILD b/tensorflow/contrib/BUILD
index b669922482..14fa6ea7cd 100644
--- a/tensorflow/contrib/BUILD
+++ b/tensorflow/contrib/BUILD
@@ -5,6 +5,8 @@ licenses(["notice"]) # Apache 2.0
package(default_visibility = ["//tensorflow:__subpackages__"])
+load("//third_party/mpi:mpi.bzl", "if_mpi")
+
py_library(
name = "contrib_py",
srcs = glob(["**/*.py"]),
@@ -85,7 +87,7 @@ py_library(
"//tensorflow/contrib/tpu",
"//tensorflow/contrib/training:training_py",
"//tensorflow/contrib/util:util_py",
- ],
+ ] + if_mpi(["//tensorflow/contrib/mpi_collectives:mpi_ops_py"]),
)
cc_library(
diff --git a/tensorflow/contrib/android/java/org/tensorflow/contrib/android/TensorFlowInferenceInterface.java b/tensorflow/contrib/android/java/org/tensorflow/contrib/android/TensorFlowInferenceInterface.java
index f60bd8282c..395dd6c5d2 100644
--- a/tensorflow/contrib/android/java/org/tensorflow/contrib/android/TensorFlowInferenceInterface.java
+++ b/tensorflow/contrib/android/java/org/tensorflow/contrib/android/TensorFlowInferenceInterface.java
@@ -159,6 +159,22 @@ public class TensorFlowInferenceInterface {
throw new RuntimeException("Failed to load model from the input stream", e);
}
}
+
+ /*
+ * Construct a TensorFlowInferenceInterface with provided Graph
+ *
+ * @param g The Graph to use to construct this interface.
+ */
+ public TensorFlowInferenceInterface(Graph g) {
+ prepareNativeRuntime();
+
+ // modelName is redundant here, here is for
+ // avoiding error in initialization as modelName is marked final.
+ this.modelName = "";
+ this.g = g;
+ this.sess = new Session(g);
+ this.runner = sess.runner();
+ }
/**
* Runs inference between the previously registered input nodes (via feed*) and the requested
diff --git a/tensorflow/contrib/boosted_trees/lib/trees/decision_tree.cc b/tensorflow/contrib/boosted_trees/lib/trees/decision_tree.cc
index f641cb5d6a..9968c9c3bf 100644
--- a/tensorflow/contrib/boosted_trees/lib/trees/decision_tree.cc
+++ b/tensorflow/contrib/boosted_trees/lib/trees/decision_tree.cc
@@ -15,6 +15,8 @@
#include "tensorflow/contrib/boosted_trees/lib/trees/decision_tree.h"
#include "tensorflow/core/platform/macros.h"
+#include <algorithm>
+
namespace tensorflow {
namespace boosted_trees {
namespace trees {
diff --git a/tensorflow/contrib/cmake/external/farmhash.cmake b/tensorflow/contrib/cmake/external/farmhash.cmake
index 41b0e8c92b..96fade8b53 100644
--- a/tensorflow/contrib/cmake/external/farmhash.cmake
+++ b/tensorflow/contrib/cmake/external/farmhash.cmake
@@ -52,6 +52,7 @@ else()
CONFIGURE_COMMAND
${farmhash_BUILD}/configure
--prefix=${farmhash_INSTALL}
+ --libdir=${farmhash_INSTALL}/lib
--enable-shared=yes
CXXFLAGS=-fPIC)
diff --git a/tensorflow/contrib/cmake/external/gif.cmake b/tensorflow/contrib/cmake/external/gif.cmake
index 87e15f6359..5cb719b878 100644
--- a/tensorflow/contrib/cmake/external/gif.cmake
+++ b/tensorflow/contrib/cmake/external/gif.cmake
@@ -66,6 +66,7 @@ else()
${CMAKE_CURRENT_BINARY_DIR}/gif/src/gif/configure
--with-pic
--prefix=${gif_INSTALL}
+ --libdir=${gif_INSTALL}/lib
--enable-shared=yes
)
diff --git a/tensorflow/contrib/cmake/external/grpc.cmake b/tensorflow/contrib/cmake/external/grpc.cmake
index b06755afc2..464aad74c6 100644
--- a/tensorflow/contrib/cmake/external/grpc.cmake
+++ b/tensorflow/contrib/cmake/external/grpc.cmake
@@ -28,7 +28,8 @@ else()
set(grpc_STATIC_LIBRARIES
${CMAKE_CURRENT_BINARY_DIR}/grpc/src/grpc/libgrpc++_unsecure.a
${CMAKE_CURRENT_BINARY_DIR}/grpc/src/grpc/libgrpc_unsecure.a
- ${CMAKE_CURRENT_BINARY_DIR}/grpc/src/grpc/libgpr.a)
+ ${CMAKE_CURRENT_BINARY_DIR}/grpc/src/grpc/libgpr.a
+ ${CMAKE_CURRENT_BINARY_DIR}/grpc/src/grpc/third_party/cares/libcares.a)
endif()
ExternalProject_Add(grpc
@@ -42,6 +43,7 @@ ExternalProject_Add(grpc
# on "grpc" from the "grpc++_unsecure" rule.
PATCH_COMMAND ${CMAKE_COMMAND} -E copy_if_different ${CMAKE_CURRENT_SOURCE_DIR}/patches/grpc/CMakeLists.txt ${GRPC_BUILD}
BUILD_COMMAND ${CMAKE_COMMAND} --build . --config Release --target grpc++_unsecure
+ COMMAND ${CMAKE_COMMAND} --build . --config Release --target grpc_cpp_plugin
INSTALL_COMMAND ""
CMAKE_CACHE_ARGS
-DCMAKE_BUILD_TYPE:STRING=Release
diff --git a/tensorflow/contrib/cmake/external/jpeg.cmake b/tensorflow/contrib/cmake/external/jpeg.cmake
index f2797d13b2..ff17b975b9 100644
--- a/tensorflow/contrib/cmake/external/jpeg.cmake
+++ b/tensorflow/contrib/cmake/external/jpeg.cmake
@@ -74,6 +74,7 @@ else()
CONFIGURE_COMMAND
${jpeg_BUILD}/configure
--prefix=${jpeg_INSTALL}
+ --libdir=${jpeg_INSTALL}/lib
--enable-shared=yes
CFLAGS=-fPIC
)
diff --git a/tensorflow/contrib/cmake/external/sqlite.cmake b/tensorflow/contrib/cmake/external/sqlite.cmake
index a8809d4a4a..6fa3a57699 100644
--- a/tensorflow/contrib/cmake/external/sqlite.cmake
+++ b/tensorflow/contrib/cmake/external/sqlite.cmake
@@ -23,7 +23,7 @@ set(sqlite_INSTALL ${CMAKE_CURRENT_BINARY_DIR}/sqlite/install)
if(WIN32)
set(sqlite_STATIC_LIBRARIES ${sqlite_INSTALL}/lib/sqlite.lib)
else()
- set(sqlite_STATIC_LIBRARIES ${sqlite_INSTALL}/lib/sqlite.a)
+ set(sqlite_STATIC_LIBRARIES ${sqlite_INSTALL}/lib/libsqlite.a)
endif()
set(sqlite_HEADERS
@@ -49,11 +49,14 @@ else()
PREFIX sqlite
URL ${sqlite_URL}
URL_HASH ${sqlite_HASH}
+ PATCH_COMMAND ${CMAKE_COMMAND} -E copy_if_different ${CMAKE_CURRENT_SOURCE_DIR}/patches/sqlite/CMakeLists.txt ${sqlite_BUILD}
INSTALL_DIR ${sqlite_INSTALL}
DOWNLOAD_DIR "${DOWNLOAD_LOCATION}"
- BUILD_COMMAND $(MAKE)
- INSTALL_COMMAND $(MAKE) install
- CFLAGS=-fPIC
+ CMAKE_CACHE_ARGS
+ -DCMAKE_BUILD_TYPE:STRING=Release
+ -DCMAKE_VERBOSE_MAKEFILE:BOOL=OFF
+ -DCMAKE_POSITION_INDEPENDENT_CODE:BOOL=ON
+ -DCMAKE_INSTALL_PREFIX:STRING=${sqlite_INSTALL}
)
endif()
@@ -69,4 +72,4 @@ add_custom_target(sqlite_copy_headers_to_destination
foreach(header_file ${sqlite_HEADERS})
add_custom_command(TARGET sqlite_copy_headers_to_destination PRE_BUILD
COMMAND ${CMAKE_COMMAND} -E copy_if_different ${header_file} ${sqlite_INCLUDE_DIR})
-endforeach()
\ No newline at end of file
+endforeach()
diff --git a/tensorflow/contrib/cmake/tf_core_framework.cmake b/tensorflow/contrib/cmake/tf_core_framework.cmake
index 53d6413310..1b64a52ece 100644
--- a/tensorflow/contrib/cmake/tf_core_framework.cmake
+++ b/tensorflow/contrib/cmake/tf_core_framework.cmake
@@ -49,6 +49,44 @@ function(RELATIVE_PROTOBUF_GENERATE_CPP SRCS HDRS ROOT_DIR)
set(${HDRS} ${${HDRS}} PARENT_SCOPE)
endfunction()
+if(NOT WIN32)
+ function(RELATIVE_PROTOBUF_GENERATE_GRPC_CPP SRCS HDRS ROOT_DIR)
+ if(NOT ARGN)
+ message(SEND_ERROR "Error: RELATIVE_PROTOBUF_GENERATE_GRPC_CPP() called without any proto files")
+ return()
+ endif()
+
+ set(${SRCS})
+ set(${HDRS})
+ foreach(FIL ${ARGN})
+ set(ABS_FIL ${ROOT_DIR}/${FIL})
+ get_filename_component(FIL_WE ${FIL} NAME_WE)
+ get_filename_component(FIL_DIR ${ABS_FIL} PATH)
+ file(RELATIVE_PATH REL_DIR ${ROOT_DIR} ${FIL_DIR})
+
+ list(APPEND ${SRCS} "${CMAKE_CURRENT_BINARY_DIR}/${REL_DIR}/${FIL_WE}.grpc.pb.cc")
+ list(APPEND ${HDRS} "${CMAKE_CURRENT_BINARY_DIR}/${REL_DIR}/${FIL_WE}.grpc.pb.h")
+ list(APPEND ${SRCS} "${CMAKE_CURRENT_BINARY_DIR}/${REL_DIR}/${FIL_WE}.pb.cc")
+ list(APPEND ${HDRS} "${CMAKE_CURRENT_BINARY_DIR}/${REL_DIR}/${FIL_WE}.pb.h")
+
+ add_custom_command(
+ OUTPUT "${CMAKE_CURRENT_BINARY_DIR}/${REL_DIR}/${FIL_WE}.grpc.pb.cc"
+ "${CMAKE_CURRENT_BINARY_DIR}/${REL_DIR}/${FIL_WE}.grpc.pb.h"
+ "${CMAKE_CURRENT_BINARY_DIR}/${REL_DIR}/${FIL_WE}.pb.cc"
+ "${CMAKE_CURRENT_BINARY_DIR}/${REL_DIR}/${FIL_WE}.pb.h"
+ COMMAND ${PROTOBUF_PROTOC_EXECUTABLE}
+ ARGS --grpc_out ${CMAKE_CURRENT_BINARY_DIR} --cpp_out ${CMAKE_CURRENT_BINARY_DIR} --plugin protoc-gen-grpc=${GRPC_BUILD}/grpc_cpp_plugin -I ${ROOT_DIR} ${ABS_FIL} -I ${PROTOBUF_INCLUDE_DIRS}
+ DEPENDS ${ABS_FIL} protobuf grpc
+ COMMENT "Running C++ protocol buffer grpc compiler on ${FIL}"
+ VERBATIM )
+ endforeach()
+
+ set_source_files_properties(${${SRCS}} ${${HDRS}} PROPERTIES GENERATED TRUE)
+ set(${SRCS} ${${SRCS}} PARENT_SCOPE)
+ set(${HDRS} ${${HDRS}} PARENT_SCOPE)
+ endfunction()
+endif()
+
function(RELATIVE_PROTOBUF_TEXT_GENERATE_CPP SRCS HDRS ROOT_DIR)
if(NOT ARGN)
message(SEND_ERROR "Error: RELATIVE_PROTOBUF_TEXT_GENERATE_CPP() called without any proto files")
@@ -93,6 +131,7 @@ RELATIVE_PROTOBUF_GENERATE_CPP(PROTO_SRCS PROTO_HDRS
${tensorflow_source_dir} ${tf_protos_cc_srcs}
)
+
set(PROTO_TEXT_EXE "proto_text")
set(tf_proto_text_srcs
"tensorflow/core/example/example.proto"
@@ -133,7 +172,17 @@ RELATIVE_PROTOBUF_TEXT_GENERATE_CPP(PROTO_TEXT_SRCS PROTO_TEXT_HDRS
${tensorflow_source_dir} ${tf_proto_text_srcs}
)
-add_library(tf_protos_cc ${PROTO_SRCS} ${PROTO_HDRS})
+if(WIN32)
+ add_library(tf_protos_cc ${PROTO_SRCS} ${PROTO_HDRS})
+else()
+ file(GLOB_RECURSE tf_protos_grpc_cc_srcs RELATIVE ${tensorflow_source_dir}
+ "${tensorflow_source_dir}/tensorflow/core/debug/*.proto"
+ )
+ RELATIVE_PROTOBUF_GENERATE_GRPC_CPP(PROTO_GRPC_SRCS PROTO_GRPC_HDRS
+ ${tensorflow_source_dir} ${tf_protos_grpc_cc_srcs}
+ )
+ add_library(tf_protos_cc ${PROTO_GRPC_SRCS} ${PROTO_GRPC_HDRS} ${PROTO_SRCS} ${PROTO_HDRS})
+endif()
########################################################
# tf_core_lib library
diff --git a/tensorflow/contrib/crf/README.md b/tensorflow/contrib/crf/README.md
index a184e321bb..b58cc2dd04 100644
--- a/tensorflow/contrib/crf/README.md
+++ b/tensorflow/contrib/crf/README.md
@@ -46,31 +46,25 @@ with tf.Graph().as_default():
log_likelihood, transition_params = tf.contrib.crf.crf_log_likelihood(
unary_scores, y_t, sequence_lengths_t)
+ # Compute the viterbi sequence and score.
+ viterbi_sequence, viterbi_score = tf.contrib.crf.crf_decode(
+ unary_scores, transition_params, sequence_lengths_t)
+
# Add a training op to tune the parameters.
loss = tf.reduce_mean(-log_likelihood)
train_op = tf.train.GradientDescentOptimizer(0.01).minimize(loss)
- # Train for a fixed number of iterations.
session.run(tf.global_variables_initializer())
+
+ mask = (np.expand_dims(np.arange(num_words), axis=0) <
+ np.expand_dims(sequence_lengths, axis=1))
+ total_labels = np.sum(sequence_lengths)
+
+ # Train for a fixed number of iterations.
for i in range(1000):
- tf_unary_scores, tf_transition_params, _ = session.run(
- [unary_scores, transition_params, train_op])
+ tf_viterbi_sequence, _ = session.run([viterbi_sequence, train_op])
if i % 100 == 0:
- correct_labels = 0
- total_labels = 0
- for tf_unary_scores_, y_, sequence_length_ in zip(tf_unary_scores, y,
- sequence_lengths):
- # Remove padding from the scores and tag sequence.
- tf_unary_scores_ = tf_unary_scores_[:sequence_length_]
- y_ = y_[:sequence_length_]
-
- # Compute the highest scoring sequence.
- viterbi_sequence, _ = tf.contrib.crf.viterbi_decode(
- tf_unary_scores_, tf_transition_params)
-
- # Evaluate word-level accuracy.
- correct_labels += np.sum(np.equal(viterbi_sequence, y_))
- total_labels += sequence_length_
+ correct_labels = np.sum((y == tf_viterbi_sequence) * mask)
accuracy = 100.0 * correct_labels / float(total_labels)
print("Accuracy: %.2f%%" % accuracy)
```
diff --git a/tensorflow/contrib/fused_conv/kernels/fused_conv2d_bias_activation_op.cc b/tensorflow/contrib/fused_conv/kernels/fused_conv2d_bias_activation_op.cc
index 2d7407980f..9275d5a22b 100644
--- a/tensorflow/contrib/fused_conv/kernels/fused_conv2d_bias_activation_op.cc
+++ b/tensorflow/contrib/fused_conv/kernels/fused_conv2d_bias_activation_op.cc
@@ -493,37 +493,42 @@ void LaunchFusedConv2DBiasActivationOp<GPUDevice, T, BiasType, ScaleType>::
dnn::AlgorithmConfig algorithm_config;
if (cudnn_use_autotune && !AutoTuneConvBiasActivation::GetInstance()->Find(
fused_conv_parameters, &algorithm_config)) {
- std::vector<dnn::AlgorithmType> algorithms;
+ std::vector<dnn::AlgorithmDesc::Index> algorithms;
CHECK(stream->parent()->GetConvolveAlgorithms(
fused_conv_parameters.ShouldIncludeWinogradNonfusedAlgo<T>(),
&algorithms));
dnn::ProfileResult best_result;
dnn::ProfileResult best_result_no_scratch;
- for (auto profile_algorithm : algorithms) {
- // TODO(zhengxq): profile each algorithm multiple times to better
- // accuracy.
- CudnnScratchAllocator scratch_allocator(ConvolveScratchSize, ctx);
- dnn::ProfileResult profile_result;
- bool cudnn_launch_status =
- stream
- ->ThenFusedConvolveWithAlgorithm(
- conv_input_desc, conv_input_ptr, conv_input_scale,
- filter_desc, filter_ptr, conv_desc, side_input_ptr,
- side_input_scale, bias_desc, bias_ptr,
- dnn::ActivationMode::kRelu, output_desc, &output_ptr,
- &scratch_allocator, dnn::AlgorithmConfig(profile_algorithm),
- &profile_result)
- .ok();
- if (cudnn_launch_status) {
- if (profile_result.is_valid()) {
- if (profile_result.elapsed_time_in_ms() <
- best_result.elapsed_time_in_ms()) {
- best_result = profile_result;
- }
- if (scratch_allocator.TotalByteSize() == 0 &&
- profile_result.elapsed_time_in_ms() <
- best_result_no_scratch.elapsed_time_in_ms()) {
- best_result_no_scratch = profile_result;
+ // TODO(benbarsdell): Ideally this should not attempt using tensor op math
+ // if it's not enabled.
+ for (bool use_tensor_ops : {false, true}) {
+ for (auto algo_index : algorithms) {
+ // TODO(zhengxq): profile each algorithm multiple times to better
+ // accuracy.
+ dnn::AlgorithmDesc profile_algorithm(algo_index, use_tensor_ops);
+ CudnnScratchAllocator scratch_allocator(ConvolveScratchSize, ctx);
+ dnn::ProfileResult profile_result;
+ bool cudnn_launch_status =
+ stream
+ ->ThenFusedConvolveWithAlgorithm(
+ conv_input_desc, conv_input_ptr, conv_input_scale,
+ filter_desc, filter_ptr, conv_desc, side_input_ptr,
+ side_input_scale, bias_desc, bias_ptr,
+ dnn::ActivationMode::kRelu, output_desc, &output_ptr,
+ &scratch_allocator, dnn::AlgorithmConfig(profile_algorithm),
+ &profile_result)
+ .ok();
+ if (cudnn_launch_status) {
+ if (profile_result.is_valid()) {
+ if (profile_result.elapsed_time_in_ms() <
+ best_result.elapsed_time_in_ms()) {
+ best_result = profile_result;
+ }
+ if (scratch_allocator.TotalByteSize() == 0 &&
+ profile_result.elapsed_time_in_ms() <
+ best_result_no_scratch.elapsed_time_in_ms()) {
+ best_result_no_scratch = profile_result;
+ }
}
}
}
diff --git a/tensorflow/contrib/gdr/gdr_memory_manager.cc b/tensorflow/contrib/gdr/gdr_memory_manager.cc
index c55989e3e5..5c7ac74428 100644
--- a/tensorflow/contrib/gdr/gdr_memory_manager.cc
+++ b/tensorflow/contrib/gdr/gdr_memory_manager.cc
@@ -125,13 +125,15 @@ class GdrMemoryManager : public RemoteMemoryManager {
virtual void Stop() override;
- virtual Status TransportOptionsFromTensor(
+ virtual void TransportOptionsFromTensor(
::google::protobuf::Any* mutable_transport_options, const Tensor& tensor,
- Device* device, DeviceContext* device_context, bool on_host) override;
+ Device* device, DeviceContext* device_context, bool on_host,
+ StatusCallback done) override;
- virtual Status TensorFromTransportOptions(
+ virtual void TensorFromTransportOptions(
Tensor* tensor, const ::google::protobuf::Any& transport_options,
- Device* device, DeviceContext* device_context, bool on_host) override;
+ Device* device, DeviceContext* device_context, bool on_host,
+ StatusCallback done) override;
protected:
Status CreateEndpoint(const string& host, const string& port,
@@ -145,10 +147,6 @@ class GdrMemoryManager : public RemoteMemoryManager {
void InsertMemoryRegion(void* addr, size_t length);
-#if GOOGLE_CUDA
- void InsertCUDAMemoryRegion(void* addr, size_t length);
-#endif
-
void EvictMemoryRegion(void* addr, size_t length);
private:
@@ -415,45 +413,74 @@ void GdrMemoryManager::Run() {
void GdrMemoryManager::Stop() { stopped_ = true; }
-Status GdrMemoryManager::TransportOptionsFromTensor(
+void GdrMemoryManager::TransportOptionsFromTensor(
::google::protobuf::Any* mutable_transport_options, const Tensor& tensor,
- Device* device, DeviceContext* device_context, bool on_host) {
+ Device* device, DeviceContext* device_context, bool on_host,
+ StatusCallback done) {
auto buffer = DMAHelper::buffer(&tensor);
void* addr = buffer->data();
size_t length = buffer->size();
if (length == 0) {
- return errors::Unavailable("Cannot register tensor buffer of size 0");
+ done(errors::Unavailable("Cannot register tensor buffer of size 0"));
+ return;
}
ibv_mr* mr = FindMemoryRegion(addr, length);
- Tensor host_copy;
#if GOOGLE_CUDA
- if (!on_host && mr != nullptr) {
- TF_RETURN_IF_ERROR(GPUUtil::Sync(device));
- } else if (!on_host) {
+ if (!on_host) {
Allocator* alloc = ProcessState::singleton()->GetCUDAHostAllocator(0);
- host_copy = Tensor(alloc, tensor.dtype(), tensor.shape());
- Status s;
- Notification n;
- GPUUtil::CopyGPUTensorToCPU(device, device_context, &tensor, &host_copy,
- [&s, &n](const Status& status) {
- s.Update(status);
- n.Notify();
- });
- n.WaitForNotification();
- if (!s.ok()) {
- return s;
- }
- buffer = DMAHelper::buffer(&host_copy);
- addr = buffer->data();
- length = buffer->size();
- mr = FindMemoryRegion(addr, length);
+ Tensor* host_copy = new Tensor(alloc, tensor.dtype(), tensor.shape());
+ GPUUtil::CopyGPUTensorToCPU(
+ device, device_context, &tensor, host_copy,
+ [done, host_copy, mutable_transport_options, this](const Status& s) {
+ if (!s.ok()) {
+ done(s);
+ delete host_copy;
+ return;
+ }
+ auto buffer = DMAHelper::buffer(host_copy);
+ void* addr = buffer->data();
+ size_t length = buffer->size();
+ ibv_mr* mr = FindMemoryRegion(addr, length);
+
+ if (mr == nullptr) {
+ done(errors::Unavailable("Cannot find pinned memory region"));
+ delete host_copy;
+ return;
+ }
+
+ buffer->Ref();
+ TensorKey tensor_key = next_key_++;
+ {
+ mutex_lock l(server_mu_);
+ tensor_buffers_.insert(std::make_pair(tensor_key, buffer));
+ }
+
+ uint64_t checksum = 0;
+ if (VLOG_IS_ON(2)) {
+ checksum = GPUUtil::Checksum(*host_copy);
+ }
+
+ RemoteMemoryRegion remote_mr;
+ remote_mr.set_host(host_);
+ remote_mr.set_port(port_);
+ remote_mr.set_addr(reinterpret_cast<uint64_t>(addr));
+ remote_mr.set_rkey(mr->rkey);
+ remote_mr.set_tensor_key(tensor_key);
+ remote_mr.set_checksum(checksum);
+ mutable_transport_options->PackFrom(remote_mr);
+
+ done(Status::OK());
+ delete host_copy;
+ });
+ return;
}
#endif
if (mr == nullptr) {
- return errors::Unavailable("Cannot find pinned memory region");
+ done(errors::Unavailable("Cannot find pinned memory region"));
+ return;
}
buffer->Ref();
@@ -466,12 +493,8 @@ Status GdrMemoryManager::TransportOptionsFromTensor(
uint64_t checksum = 0;
if (VLOG_IS_ON(2)) {
#ifdef GOOGLE_CUDA
- if (device->tensorflow_gpu_device_info() && (!on_host)) {
- if (host_copy.NumElements() > 0) {
- checksum = GPUUtil::Checksum(device, device_context, host_copy);
- } else {
- checksum = GPUUtil::Checksum(device, device_context, tensor);
- }
+ if (!on_host) {
+ checksum = GPUUtil::Checksum(device, device_context, tensor);
} else {
checksum = GPUUtil::Checksum(tensor);
}
@@ -487,15 +510,17 @@ Status GdrMemoryManager::TransportOptionsFromTensor(
remote_mr.set_checksum(checksum);
mutable_transport_options->PackFrom(remote_mr);
- return Status::OK();
+ done(Status::OK());
}
-Status GdrMemoryManager::TensorFromTransportOptions(
+void GdrMemoryManager::TensorFromTransportOptions(
Tensor* tensor, const ::google::protobuf::Any& transport_options,
- Device* device, DeviceContext* device_context, bool on_host) {
+ Device* device, DeviceContext* device_context, bool on_host,
+ StatusCallback done) {
RemoteMemoryRegion remote_mr;
if (!transport_options.UnpackTo(&remote_mr)) {
- return errors::NotFound("No RDMA transport options found");
+ done(errors::NotFound("No RDMA transport options found"));
+ return;
}
auto buffer = DMAHelper::buffer(tensor);
@@ -505,9 +530,7 @@ Status GdrMemoryManager::TensorFromTransportOptions(
Tensor host_copy;
#if GOOGLE_CUDA
- if (!on_host && mr != nullptr) {
- TF_RETURN_IF_ERROR(GPUUtil::Sync(device));
- } else if (!on_host) {
+ if (mr == nullptr && !on_host) {
Allocator* alloc = ProcessState::singleton()->GetCUDAHostAllocator(0);
host_copy = Tensor(alloc, tensor->dtype(), tensor->shape());
buffer = DMAHelper::buffer(&host_copy);
@@ -518,7 +541,8 @@ Status GdrMemoryManager::TensorFromTransportOptions(
#endif // GOOGLE_CUDA
if (mr == nullptr) {
- return errors::Unavailable("Cannot find pinned memory region");
+ done(errors::Unavailable("Cannot find pinned memory region"));
+ return;
}
decltype(clients_)::iterator iter;
@@ -529,8 +553,12 @@ Status GdrMemoryManager::TensorFromTransportOptions(
std::make_pair(std::make_pair(remote_mr.host(), remote_mr.port()),
RdmaEndpointPtr(nullptr, EndpointDeleter)));
if (success || iter->second.get() == nullptr) {
- TF_RETURN_IF_ERROR(
- CreateEndpoint(remote_mr.host(), remote_mr.port(), iter->second));
+ Status s =
+ CreateEndpoint(remote_mr.host(), remote_mr.port(), iter->second);
+ if (!s.ok()) {
+ done(s);
+ return;
+ }
}
}
rdma_cm_id* id = iter->second.get();
@@ -539,37 +567,57 @@ Status GdrMemoryManager::TensorFromTransportOptions(
if (rdma_post_read(id, nullptr, buffer->data(), buffer->size(), mr, 0,
remote_mr.addr(), remote_mr.rkey())) {
- return errors::Unavailable(strerror(errno), ": ", "rdma_post_read failed");
+ done(errors::Unavailable(strerror(errno), ": ", "rdma_post_read failed"));
+ return;
}
ibv_send_wr wr = {};
wr.opcode = IBV_WR_RDMA_WRITE_WITH_IMM;
wr.imm_data = htonl(remote_mr.tensor_key());
- wr.send_flags = IBV_SEND_FENCE | IBV_SEND_SIGNALED;
+ wr.send_flags = IBV_SEND_SIGNALED;
ibv_send_wr* bad_wr;
if (ibv_post_send(id->qp, &wr, &bad_wr)) {
- return errors::Unavailable(strerror(errno), ": ", "ibv_post_send failed");
+ done(errors::Unavailable(strerror(errno), ": ", "ibv_post_send failed"));
+ return;
}
ibv_wc wc = {};
- int ret = rdma_get_send_comp(id, &wc);
+ int ret;
+ while ((ret = ibv_poll_cq(id->send_cq, 1, &wc)) == 0)
+ ;
if (ret < 0 || wc.status) {
- return errors::Unavailable(ibv_wc_status_str(wc.status));
+ done(errors::Unavailable(ibv_wc_status_str(wc.status)));
+ return;
}
#if GOOGLE_CUDA
if (host_copy.NumElements() > 0) {
- Status s;
- Notification n;
- GPUUtil::CopyCPUTensorToGPU(&host_copy, device_context, device, tensor,
- [&s, &n](const Status& status) {
- s.Update(status);
- n.Notify();
- });
- n.WaitForNotification();
- if (!s.ok()) {
- return s;
+ uint64_t checksum = 0;
+ if (VLOG_IS_ON(2)) {
+ checksum = GPUUtil::Checksum(host_copy);
+ CHECK(checksum == remote_mr.checksum())
+ << "Checksum mismatch: " << checksum << "!=" << remote_mr.checksum();
}
+ Tensor* ref = new Tensor;
+ std::swap(host_copy, *ref);
+ GPUUtil::CopyCPUTensorToGPU(
+ ref, device_context, device, tensor,
+ [ref, done, buffer, remote_mr, start](const Status& s) {
+ if (!s.ok()) {
+ done(s);
+ delete ref;
+ return;
+ }
+ uint64_t end = Env::Default()->NowMicros();
+
+ VLOG(2) << "RDMA from remote memory region " << remote_mr.rkey()
+ << " of size " << buffer->size() << " with tensor key "
+ << remote_mr.tensor_key() << " took " << (end - start)
+ << " micros";
+ done(Status::OK());
+ delete ref;
+ });
+ return;
}
#endif // GOOGLE_CUDA
@@ -583,11 +631,7 @@ Status GdrMemoryManager::TensorFromTransportOptions(
if (VLOG_IS_ON(2)) {
#ifdef GOOGLE_CUDA
if (device->tensorflow_gpu_device_info() && (!on_host)) {
- if (host_copy.NumElements() > 0) {
- checksum = GPUUtil::Checksum(device, device_context, host_copy);
- } else {
- checksum = GPUUtil::Checksum(device, device_context, *tensor);
- }
+ checksum = GPUUtil::Checksum(device, device_context, *tensor);
} else {
checksum = GPUUtil::Checksum(*tensor);
}
@@ -595,7 +639,7 @@ Status GdrMemoryManager::TensorFromTransportOptions(
<< "!=" << remote_mr.checksum();
#endif
}
- return Status::OK();
+ done(Status::OK());
}
Status GdrMemoryManager::CreateEndpoint(const string& host, const string& port,
diff --git a/tensorflow/contrib/gdr/gdr_memory_manager.h b/tensorflow/contrib/gdr/gdr_memory_manager.h
index e0e2a3f624..9ac1aa96c4 100644
--- a/tensorflow/contrib/gdr/gdr_memory_manager.h
+++ b/tensorflow/contrib/gdr/gdr_memory_manager.h
@@ -39,15 +39,17 @@ class RemoteMemoryManager {
// Encodes the tensor information to an arbitrary protocol buffer
// The protocol buffer needs to be transmitted via some other channel
- virtual Status TransportOptionsFromTensor(
+ virtual void TransportOptionsFromTensor(
::google::protobuf::Any* mutable_transport_options, const Tensor& tensor,
- Device* device, DeviceContext* device_context, bool on_host) = 0;
+ Device* device, DeviceContext* device_context, bool on_host,
+ StatusCallback done) = 0;
// Retrieve the tensor from the encoded protocol buffer
// Note that the tensor has to be allocated, but not initialized
- virtual Status TensorFromTransportOptions(
+ virtual void TensorFromTransportOptions(
Tensor* tensor, const ::google::protobuf::Any& transport_options,
- Device* device, DeviceContext* device_context, bool on_host) = 0;
+ Device* device, DeviceContext* device_context, bool on_host,
+ StatusCallback done) = 0;
};
RemoteMemoryManager* CreateRemoteMemoryManager(const string& host,
diff --git a/tensorflow/contrib/gdr/gdr_rendezvous_mgr.cc b/tensorflow/contrib/gdr/gdr_rendezvous_mgr.cc
index 259ee8817d..adef2aac33 100644
--- a/tensorflow/contrib/gdr/gdr_rendezvous_mgr.cc
+++ b/tensorflow/contrib/gdr/gdr_rendezvous_mgr.cc
@@ -61,16 +61,20 @@ class GdrRecvTensorCall : public BaseRecvTensorCall {
const bool on_host =
(dst_device_->tensorflow_gpu_device_info() == nullptr) ||
recv_args_.alloc_attrs.on_host();
- Status s = remote_memory_manager_->TensorFromTransportOptions(
+ remote_memory_manager_->TensorFromTransportOptions(
const_cast<Tensor*>(&tensor()), transport_options, dst_device_,
- recv_args_.device_context, on_host);
- if (!s.ok()) {
- mutex_lock l(mu_);
- status_.Update(s);
- LOG(ERROR)
- << "Cannot find pinned memory region from allocator "
- << dst_device_->GetAllocator(recv_args_.alloc_attrs)->Name();
- }
+ recv_args_.device_context, on_host,
+ [this, recv_done](const Status& s) {
+ if (!s.ok()) {
+ mutex_lock l(mu_);
+ status_.Update(s);
+ LOG(ERROR) << "Cannot find pinned memory region from allocator "
+ << dst_device_->GetAllocator(recv_args_.alloc_attrs)
+ ->Name();
+ }
+ recv_done();
+ });
+ return;
}
if (!s.ok()) {
mutex_lock l(mu_);
diff --git a/tensorflow/contrib/gdr/gdr_worker.cc b/tensorflow/contrib/gdr/gdr_worker.cc
index 0bff0aff6d..5686412347 100644
--- a/tensorflow/contrib/gdr/gdr_worker.cc
+++ b/tensorflow/contrib/gdr/gdr_worker.cc
@@ -86,24 +86,25 @@ void GdrWorker::GrpcRecvTensorAsync(CallOptions* opts,
if (val.TotalBytes() > 0 && (!is_dead) &&
DMAHelper::CanUseDMA(&val) && dma_ok) {
// DMA cases.
- RecvTensorResponse proto;
- auto transport_options = proto.mutable_transport_options();
- Status s = remote_memory_manager_->TransportOptionsFromTensor(
+ RecvTensorResponse* proto = new RecvTensorResponse;
+ proto->set_is_dead(is_dead);
+ proto->set_send_start_micros(Env::Default()->NowMicros());
+ TensorProto* tensor_proto = proto->mutable_tensor();
+ tensor_proto->set_dtype(val.dtype());
+ val.shape().AsProto(tensor_proto->mutable_tensor_shape());
+ auto transport_options = proto->mutable_transport_options();
+ remote_memory_manager_->TransportOptionsFromTensor(
transport_options, val, src_dev, send_args.device_context,
- on_host);
- if (s.ok()) {
- proto.set_is_dead(is_dead);
- proto.set_send_start_micros(Env::Default()->NowMicros());
- TensorProto* tensor_proto = proto.mutable_tensor();
- tensor_proto->set_dtype(val.dtype());
- val.shape().AsProto(tensor_proto->mutable_tensor_shape());
- grpc::EncodeRecvTensorResponseToByteBuffer(proto, response);
- done(Status::OK());
- return;
- } else {
- done(s);
- return;
- }
+ on_host, [proto, done, response](const Status& s) {
+ if (s.ok()) {
+ grpc::EncodeRecvTensorResponseToByteBuffer(*proto,
+ response);
+ done(Status::OK());
+ } else {
+ done(s);
+ }
+ delete proto;
+ });
} else {
// Non-DMA cases.
if (src_dev->tensorflow_gpu_device_info() && (!on_host)) {
diff --git a/tensorflow/contrib/labeled_tensor/python/ops/core.py b/tensorflow/contrib/labeled_tensor/python/ops/core.py
index fc1ea83449..abc18aa123 100644
--- a/tensorflow/contrib/labeled_tensor/python/ops/core.py
+++ b/tensorflow/contrib/labeled_tensor/python/ops/core.py
@@ -278,7 +278,7 @@ class LabeledTensor(object):
@tc.accepts(object, ops.Tensor,
tc.Union(Axes, tc.Collection(tc.Union(string_types, AxisLike))))
def __init__(self, tensor, axes):
- """Construct a LabeledTenor.
+ """Construct a LabeledTensor.
Args:
tensor: The underlying tensor containing the data.
diff --git a/tensorflow/contrib/layers/__init__.py b/tensorflow/contrib/layers/__init__.py
index 3d890987f2..d8ab7c2d70 100644
--- a/tensorflow/contrib/layers/__init__.py
+++ b/tensorflow/contrib/layers/__init__.py
@@ -91,6 +91,7 @@ See the @{$python/contrib.layers} guide.
@@sparse_column_with_hash_bucket
@@sparse_column_with_integerized_feature
@@sparse_column_with_keys
+@@sparse_column_with_vocabulary_file
@@weighted_sparse_column
@@weighted_sum_from_feature_columns
@@infer_real_valued_columns
diff --git a/tensorflow/contrib/layers/python/layers/layers.py b/tensorflow/contrib/layers/python/layers/layers.py
index 9c85ed8c19..a5da0289f4 100644
--- a/tensorflow/contrib/layers/python/layers/layers.py
+++ b/tensorflow/contrib/layers/python/layers/layers.py
@@ -2600,7 +2600,7 @@ def spatial_softmax(features,
Read more here:
"Learning visual feature spaces for robotic manipulation with
- deep spatial autoencoders." Finn et. al, http://arxiv.org/abs/1509.06113.
+ deep spatial autoencoders." Finn et al., http://arxiv.org/abs/1509.06113.
Args:
features: A `Tensor` of size [batch_size, W, H, num_channels]; the
diff --git a/tensorflow/contrib/learn/BUILD b/tensorflow/contrib/learn/BUILD
index d0e9230698..02237f3058 100644
--- a/tensorflow/contrib/learn/BUILD
+++ b/tensorflow/contrib/learn/BUILD
@@ -768,7 +768,7 @@ py_test(
":learn",
"//tensorflow/contrib/layers:layers_py",
"//tensorflow/contrib/session_bundle:exporter",
- "//tensorflow/contrib/session_bundle:manifest_proto_py_pb2",
+ "//tensorflow/contrib/session_bundle:manifest_proto_py",
"//tensorflow/python:array_ops",
"//tensorflow/python:client",
"//tensorflow/python:client_testlib",
diff --git a/tensorflow/contrib/learn/python/learn/estimators/estimator.py b/tensorflow/contrib/learn/python/learn/estimators/estimator.py
index b7b4883a91..234d731850 100644
--- a/tensorflow/contrib/learn/python/learn/estimators/estimator.py
+++ b/tensorflow/contrib/learn/python/learn/estimators/estimator.py
@@ -418,6 +418,7 @@ class BaseEstimator(
"model_dir are set both in constructor and RunConfig, but with "
"different values. In constructor: '{}', in RunConfig: "
"'{}' ".format(model_dir, self._config.model_dir))
+ # pylint: enable=g-doc-exception
self._model_dir = model_dir or self._config.model_dir
if self._model_dir is None:
diff --git a/tensorflow/contrib/learn/python/learn/experiment.py b/tensorflow/contrib/learn/python/learn/experiment.py
index 80785a987f..9b55826e62 100644
--- a/tensorflow/contrib/learn/python/learn/experiment.py
+++ b/tensorflow/contrib/learn/python/learn/experiment.py
@@ -327,15 +327,20 @@ class Experiment(object):
# Otherwise, the servers will wait to connect to each other before starting
# to train. We might as well start as soon as we can.
config = self._estimator.config
- if (config.cluster_spec and config.master and
- config.environment == run_config.Environment.LOCAL):
- logging.warn("ClusterSpec and master are provided, but environment is "
- "set to 'local'. Set environment to 'cloud' if you intend "
- "to use the distributed runtime.")
- if (config.environment != run_config.Environment.LOCAL and
- config.environment != run_config.Environment.GOOGLE and
- config.cluster_spec and config.master):
- self._start_server()
+ if isinstance(config, run_config.RunConfig):
+ if (config.cluster_spec and config.master and
+ config.environment == run_config.Environment.LOCAL):
+ logging.warn("ClusterSpec and master are provided, but environment is "
+ "set to 'local'. Set environment to 'cloud' if you intend "
+ "to use the distributed runtime.")
+ if (config.environment != run_config.Environment.LOCAL and
+ config.environment != run_config.Environment.GOOGLE and
+ config.cluster_spec and config.master):
+ self._start_server()
+ elif config.cluster_spec and config.master:
+ raise ValueError('For distributed runtime, Experiment class only works with '
+ 'tf.contrib.learn.RunConfig for now, but provided {}'
+ .format(type(config)))
extra_hooks = []
if delay_secs is None:
diff --git a/tensorflow/contrib/learn/python/learn/monitors.py b/tensorflow/contrib/learn/python/learn/monitors.py
index 3051f4048f..3e0b1ad21a 100644
--- a/tensorflow/contrib/learn/python/learn/monitors.py
+++ b/tensorflow/contrib/learn/python/learn/monitors.py
@@ -42,9 +42,6 @@ import time
import numpy as np
import six
-from tensorflow.contrib.framework import deprecated
-from tensorflow.contrib.framework.python.ops import variables as contrib_variables
-from tensorflow.contrib.learn.python.learn.summary_writer_cache import SummaryWriterCache
from tensorflow.core.framework.summary_pb2 import Summary
from tensorflow.core.util.event_pb2 import SessionLog
from tensorflow.python.estimator import estimator as core_estimator
@@ -883,7 +880,7 @@ class GraphDump(BaseMonitor):
class ExportMonitor(EveryN):
"""Monitor that exports Estimator every N steps."""
- @deprecated("2017-03-25",
+ @deprecation.deprecated("2017-03-25",
"ExportMonitor is deprecated. Please pass an "
"ExportStrategy to Experiment instead.")
def __init__(self,
diff --git a/tensorflow/contrib/makefile/Dockerfile b/tensorflow/contrib/makefile/Dockerfile
index 10a9f80c89..341f22e692 100644
--- a/tensorflow/contrib/makefile/Dockerfile
+++ b/tensorflow/contrib/makefile/Dockerfile
@@ -14,4 +14,5 @@ RUN apt-get install -y \
make \
python \
unzip \
+ wget \
zlib1g-dev
diff --git a/tensorflow/contrib/makefile/Makefile b/tensorflow/contrib/makefile/Makefile
index 5cd6cca6a1..e0cfab0b26 100644
--- a/tensorflow/contrib/makefile/Makefile
+++ b/tensorflow/contrib/makefile/Makefile
@@ -512,7 +512,6 @@ $(wildcard tensorflow/core/grappler/clusters/single_machine.*)
# Filter out all the excluded files.
TF_CC_SRCS := $(filter-out $(CORE_CC_EXCLUDE_SRCS), $(CORE_CC_ALL_SRCS))
# Add in any extra files that don't fit the patterns easily
-TF_CC_SRCS += tensorflow/core/platform/default/gpu_tracer.cc
TF_CC_SRCS += tensorflow/contrib/makefile/downloads/fft2d/fftsg.c
# Also include the op and kernel definitions.
TF_CC_SRCS += $(shell cat $(MAKEFILE_DIR)/tf_op_files.txt)
diff --git a/tensorflow/contrib/makefile/build_with_docker.sh b/tensorflow/contrib/makefile/build_with_docker.sh
index 51a73fafe5..4a17576810 100755
--- a/tensorflow/contrib/makefile/build_with_docker.sh
+++ b/tensorflow/contrib/makefile/build_with_docker.sh
@@ -25,30 +25,7 @@ SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
WORKSPACE="${SCRIPT_DIR}/../../../"
cd ${WORKSPACE} || exit 1
-DOCKER_IMG_NAME="tf-make-base"
-DOCKER_CONTEXT_PATH="${WORKSPACE}tensorflow/contrib/makefile/"
-DOCKERFILE_PATH="${DOCKER_CONTEXT_PATH}Dockerfile"
-
-# Build the docker image.
-echo "Building image ${DOCKER_IMG_NAME}..."
-docker build -t ${DOCKER_IMG_NAME} \
- -f "${DOCKERFILE_PATH}" "${DOCKER_CONTEXT_PATH}"
-
-# Check docker build command status.
-if [[ $? != "0" ]]; then
- echo "ERROR: docker build failed. Dockerfile is at ${DOCKERFILE_PATH}"
- exit 1
-fi
-
COMMAND="tensorflow/contrib/makefile/build_all_linux.sh"
-# Run the command inside the container.
-echo "Running ${COMMAND} inside ${DOCKER_IMG_NAME}..."
-# By default we cleanup - remove the container once it finish running (--rm)
-# and share the PID namespace (--pid=host) so the process inside does not have
-# pid 1 and SIGKILL is propagated to the process inside (jenkins can kill it).
-docker run --rm --pid=host \
- -v ${WORKSPACE}:/workspace \
- -w /workspace \
- "${DOCKER_IMG_NAME}" \
- ${COMMAND}
+echo "Running ${COMMAND} inside Android docker image..."
+tensorflow/tools/ci_build/ci_build.sh android ${COMMAND}
diff --git a/tensorflow/contrib/makefile/compile_nsync.sh b/tensorflow/contrib/makefile/compile_nsync.sh
index e85a79c279..ecbd9bb825 100755
--- a/tensorflow/contrib/makefile/compile_nsync.sh
+++ b/tensorflow/contrib/makefile/compile_nsync.sh
@@ -286,7 +286,7 @@ for arch in $archs; do
if [ ! -d "$nsync_platform_dir" ]; then
mkdir "$nsync_platform_dir"
- echo "$makefile" | sed 's,^[ \t]*,,' > "$nsync_platform_dir/Makefile"
+ echo "$makefile" | sed $'s,^[ \t]*,,' > "$nsync_platform_dir/Makefile"
touch "$nsync_platform_dir/dependfile"
fi
if (cd "$nsync_platform_dir" && make depend nsync.a >&2); then
diff --git a/tensorflow/contrib/makefile/download_dependencies.sh b/tensorflow/contrib/makefile/download_dependencies.sh
index 1e9958584c..39c89628d9 100755
--- a/tensorflow/contrib/makefile/download_dependencies.sh
+++ b/tensorflow/contrib/makefile/download_dependencies.sh
@@ -20,11 +20,11 @@ DOWNLOADS_DIR=tensorflow/contrib/makefile/downloads
BZL_FILE_PATH=tensorflow/workspace.bzl
EIGEN_URL="$(grep -o 'http.*bitbucket.org/eigen/eigen/get/.*tar\.gz' "${BZL_FILE_PATH}" | grep -v bazel-mirror | head -n1)"
-GEMMLOWP_URL="$(grep -o 'http.*github.com/google/gemmlowp/.*tar\.gz' "${BZL_FILE_PATH}" | grep -v bazel-mirror | head -n1)"
+GEMMLOWP_URL="$(grep -o 'http://mirror.bazel.build/github.com/google/gemmlowp/.*zip' "${BZL_FILE_PATH}" | head -n1)"
GOOGLETEST_URL="https://github.com/google/googletest/archive/release-1.8.0.tar.gz"
-NSYNC_URL="$(grep -o 'http.*github.com/google/nsync/.*tar\.gz' "${BZL_FILE_PATH}" | grep -v bazel-mirror | head -n1)"
-PROTOBUF_URL="$(grep -o 'http.*github.com/google/protobuf/.*tar\.gz' "${BZL_FILE_PATH}" | grep -v bazel-mirror | head -n1)"
-RE2_URL="$(grep -o 'http.*github.com/google/re2/.*tar\.gz' "${BZL_FILE_PATH}" | grep -v bazel-mirror | head -n1)"
+NSYNC_URL="$(grep -o 'http://mirror.bazel.build/github.com/google/nsync/.*tar\.gz' "${BZL_FILE_PATH}" | head -n1)"
+PROTOBUF_URL="$(grep -o 'http://mirror.bazel.build/github.com/google/protobuf/.*tar\.gz' "${BZL_FILE_PATH}" | head -n1)"
+RE2_URL="$(grep -o 'http://mirror.bazel.build/github.com/google/re2/.*tar\.gz' "${BZL_FILE_PATH}" | head -n1)"
FFT2D_URL="$(grep -o 'http.*fft\.tgz' "${BZL_FILE_PATH}" | grep -v bazel-mirror | head -n1)"
# TODO(petewarden): Some new code in Eigen triggers a clang bug with iOS arm64,
@@ -49,7 +49,18 @@ download_and_extract() {
local dir="${2:?${usage}}"
echo "downloading ${url}" >&2
mkdir -p "${dir}"
- curl -Ls "${url}" | tar -C "${dir}" --strip-components=1 -xz
+ if [[ "${url}" == *gz ]]; then
+ curl -Ls "${url}" | tar -C "${dir}" --strip-components=1 -xz
+ elif [[ "${url}" == *zip ]]; then
+ tempdir=$(mktemp -d)
+ tempdir2=$(mktemp -d)
+ wget ${url} -P ${tempdir}
+ unzip ${tempdir}/* -d ${tempdir2}
+ # unzip has no --strip-components option, so unzip into a temp dir, then move
+ # the files we want from the temp dir to the destination.
+ cp -R ${tempdir2}/*/* ${dir}/
+ rm -rf ${tempdir2} ${tempdir}
+ fi
# Delete any potential BUILD files, which would interfere with Bazel builds.
find "${dir}" -type f -name '*BUILD' -delete
diff --git a/tensorflow/contrib/mpi_collectives/BUILD b/tensorflow/contrib/mpi_collectives/BUILD
new file mode 100644
index 0000000000..11c5d6e776
--- /dev/null
+++ b/tensorflow/contrib/mpi_collectives/BUILD
@@ -0,0 +1,80 @@
+# Ops that communicate with other processes via MPI.
+
+package(default_visibility = [
+ "//tensorflow:__subpackages__",
+])
+
+licenses(["notice"]) # Apache 2.0
+
+filegroup(
+ name = "all_files",
+ srcs = glob(
+ ["**/*"],
+ exclude = [
+ "**/METADATA",
+ "**/OWNERS",
+ ],
+ ),
+ visibility = ["//tensorflow:__subpackages__"],
+)
+
+load(
+ "//tensorflow/core:platform/default/build_config.bzl",
+ "tf_proto_library_cc",
+)
+
+tf_proto_library_cc(
+ name = "mpi_message_proto",
+ srcs = ["mpi_message.proto"],
+ cc_api_version = 2,
+ protodeps = ["//tensorflow/core:protos_all"],
+ visibility = [
+ "//tensorflow:__subpackages__",
+ ],
+)
+
+load("//tensorflow:tensorflow.bzl", "tf_custom_op_library")
+load("//tensorflow:tensorflow.bzl", "tf_py_test")
+
+tf_custom_op_library(
+ name = "mpi_collectives.so",
+ srcs = [
+ "mpi_ops.cc",
+ "ring.cc",
+ "ring.h",
+ ],
+ gpu_srcs = [
+ "ring.cu.cc",
+ "ring.h",
+ ],
+ deps = [
+ ":mpi_message_proto_cc",
+ "//third_party/mpi",
+ ],
+)
+
+tf_py_test(
+ name = "mpi_ops_test",
+ srcs = ["mpi_ops_test.py"],
+ additional_deps = [
+ "//tensorflow:tensorflow_py",
+ "//tensorflow/python:platform",
+ ],
+ data = [
+ ":mpi_collectives.so",
+ ],
+ tags = ["manual"],
+)
+
+py_library(
+ name = "mpi_ops_py",
+ srcs = [
+ "__init__.py",
+ "mpi_ops.py",
+ ],
+ data = [
+ ":mpi_collectives.so",
+ ],
+ srcs_version = "PY2AND3",
+ visibility = ["//visibility:public"],
+)
diff --git a/tensorflow/contrib/mpi_collectives/README.md b/tensorflow/contrib/mpi_collectives/README.md
new file mode 100644
index 0000000000..c5e1a8c37e
--- /dev/null
+++ b/tensorflow/contrib/mpi_collectives/README.md
@@ -0,0 +1,5 @@
+# MPI TensorFlow integration
+
+TensorFlow MPI integration allows communicating between different TensorFlow
+processes using MPI. This enables training across multiple nodes and GPUs
+using high-speed interconnects.
diff --git a/tensorflow/contrib/mpi_collectives/__init__.py b/tensorflow/contrib/mpi_collectives/__init__.py
new file mode 100644
index 0000000000..b94f7b0a35
--- /dev/null
+++ b/tensorflow/contrib/mpi_collectives/__init__.py
@@ -0,0 +1,273 @@
+# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+# pylint: disable=g-short-docstring-punctuation
+"""## Communicating Between Processes with MPI
+
+TensorFlow natively provides inter-device communication through send and
+receive ops and inter-node communication through Distributed TensorFlow, based
+on the same send and receive abstractions. On HPC clusters where Infiniband or
+other high-speed node interconnects are available, these can end up being
+insufficient for synchronous data-parallel training (without asynchronous
+gradient descent). This module implements a variety of MPI ops which can take
+advantage of hardware-specific MPI libraries for efficient communication.
+
+In order to use this module, TensorFlow must be built with an MPI library,
+which can be provided to the `./configure` script at build time. As a user of
+TensorFlow, you will need to build TensorFlow yourself to select the MPI
+library to use; to do so, follow the [instructions for building TensorFlow from
+source](https://www.tensorflow.org/get_started/os_setup#installing_from_sources).
+
+### Utility Ops
+
+In addition to reductions and gathers, this module provides utility operations
+for detecting the running MPI configuration.
+
+Example:
+
+```python
+from tensorflow.contrib import mpi_collectives as mpi
+
+# Use `mpi.Session` instead of `tf.Session`
+with mpi.Session() as session:
+ rank = session.run(mpi.rank())
+ print("My MPI Rank:", rank)
+
+ if rank == 0:
+ print("MPI Size:", session.run(mpi.size()))
+```
+
+@@rank
+@@size
+
+### Ring Allreduce and Allgather
+
+When summing or averaging tensors across many processes, communication can
+easily become a bottleneck. A naive implementation will send all the tensor
+values to the same process, perform the reduction, and then broadcast the
+values back to all other processes, effectively creating a synchronous
+parameter server in one process. However, the process responsible for
+performing the reduction will have to receive and send a massive amount of data
+which scales with the number of processes *and* the number of parameters in the
+model.
+
+Instead of centralizing the reduction and having one primary reducer, we can
+implement a distributed allreduce or allgather. A bandwidth-optimal allreduce
+will end up sending 2(N - 1) values for every value in the input tensor,
+and can be implemented with a ring allreduce [1]. (Intuitively, a linear reduce
+requires at least (N - 1) sends between the different nodes, and a broadcast of
+the result also requires (N - 1) sends, for a total of 2 (N - 1); these two
+steps cannot be combined in a clever way to reduce the number of required
+sends.) This module implements bandwidth-optimal ring allreduce and ring
+allgather operations using MPI; by choosing a hardware-appropriate MPI
+implementation (such as OpenMPI with CUDA-IPC support), you can train large
+models with synchronous gradient descent with minimal communication overhead.
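
As a rough illustration of the schedule described above, the following is a
minimal single-process numpy sketch of the ring pattern (a reduce-scatter
phase followed by an allgather phase). It is not the implementation behind
these ops; the chunking scheme and names are assumptions made for the sketch,
and real ranks exchange chunks over MPI rather than sharing memory.

```python
import numpy as np

def ring_allreduce_sim(tensors):
  # Sum equally shaped 1-D arrays, one per simulated rank, using the ring
  # schedule: a reduce-scatter phase followed by an allgather phase.
  n = len(tensors)
  # Each rank splits its tensor into n chunks.
  chunks = [list(np.array_split(t.astype(float), n)) for t in tensors]
  sends = 0
  # Reduce-scatter: after n - 1 steps, rank r holds the full sum of
  # chunk (r + 1) % n.
  for step in range(n - 1):
    outgoing = [(r, (r - step) % n, chunks[r][(r - step) % n].copy())
                for r in range(n)]
    for r, idx, data in outgoing:
      chunks[(r + 1) % n][idx] += data
      sends += 1
  # Allgather: circulate the fully reduced chunks once around the ring.
  for step in range(n - 1):
    outgoing = [(r, (r + 1 - step) % n, chunks[r][(r + 1 - step) % n].copy())
                for r in range(n)]
    for r, idx, data in outgoing:
      chunks[(r + 1) % n][idx] = data
      sends += 1
  print("sends per rank:", sends // n, "which equals 2 * (n - 1) =", 2 * (n - 1))
  return [np.concatenate(c) for c in chunks]

ranks = [np.arange(8.0) * (r + 1) for r in range(4)]
results = ring_allreduce_sim(ranks)
assert all(np.allclose(res, sum(ranks)) for res in results)
```

Each simulated rank sends 2(N - 1) chunks in total, matching the bound given
above.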
+
+In addition to the `allreduce` and `allgather` functions, a convenience
+`DistributedOptimizer` wrapper is provided to simplify using these functions
+for reducing model gradients.
+
+Example:
+
+```python
+import tensorflow as tf
+from tensorflow.contrib import mpi_collectives as mpi
+
+# Construct a simple linear regression model to optimize
+W = tf.get_variable("W", shape=[20, 1], dtype=tf.float32)
+B = tf.get_variable("B", shape=[1, 1], dtype=tf.float32)
+inputs = tf.placeholder(tf.float32, shape=[None, 20], name="Inputs")
+outputs = tf.placeholder(tf.float32, shape=[None, 1], name="Outputs")
+loss = tf.nn.l2_loss(tf.matmul(inputs, W) + B - outputs)
+
+# Training using MPI allreduce with DistributedOptimizer
+optimizer = mpi.DistributedOptimizer(tf.train.AdamOptimizer())
+train = optimizer.minimize(loss)
+
+# Average loss over all ranks, for printing.
+# Do not pass this to an optimizer!
+avg_loss = mpi.allreduce(loss)
+
+# On different ranks, feed different input data.
+with mpi.Session() as session:
+ rank = session.run(mpi.rank())
+ batch_inputs, batch_outputs = construct_batch_for_rank(rank)
+ feed_dict = {inputs: batch_inputs, outputs: batch_outputs}
+ _, l = session.run([train, avg_loss], feed_dict=feed_dict)
+ print("Average Loss:", l)
+```
+
+[1] Patarasuk, Pitch and Yuan, Xin. "Bandwidth Optimal All-reduce Algorithms
+for Clusters of Workstations".
+
+@@Session
+@@DistributedOptimizer
+@@allreduce
+@@allgather
+"""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import tensorflow as tf
+
+from tensorflow.contrib.mpi_collectives.mpi_ops import size
+from tensorflow.contrib.mpi_collectives.mpi_ops import rank
+from tensorflow.contrib.mpi_collectives.mpi_ops import local_rank
+from tensorflow.contrib.mpi_collectives.mpi_ops import allgather
+from tensorflow.contrib.mpi_collectives.mpi_ops import _allreduce
+from tensorflow.contrib.mpi_collectives.mpi_ops import init
+
+
+def allreduce(tensor, average=True):
+ """Perform an MPI allreduce on a tf.Tensor or tf.IndexedSlices.
+
+ Arguments:
+ tensor: tf.Tensor, tf.Variable, or tf.IndexedSlices to reduce.
+ The shape of the input must be identical across all ranks.
+ average: If True, computes the average over all ranks.
+ Otherwise, computes the sum over all ranks.
+
+ This function performs a bandwidth-optimal ring allreduce on the input
+ tensor. If the input is a tf.IndexedSlices, the function instead does an
+ allgather on the values and the indices, effectively doing an allreduce on
+ the represented tensor.
+ """
+ if isinstance(tensor, tf.IndexedSlices):
+ # For IndexedSlices, do two allgathers instead of an allreduce.
+ mpi_size = tf.cast(size(), tensor.values.dtype)
+ values = allgather(tensor.values)
+ indices = allgather(tensor.indices)
+
+ # To make this operation into an average, divide all gathered values by
+ # the MPI size.
+ new_values = tf.div(values, mpi_size) if average else values
+ return tf.IndexedSlices(new_values, indices,
+ dense_shape=tensor.dense_shape)
+ else:
+ mpi_size = tf.cast(size(), tensor.dtype)
+ summed_tensor = _allreduce(tensor)
+ new_tensor = (tf.div(summed_tensor, mpi_size)
+ if average else summed_tensor)
+ return new_tensor
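
The equivalence the docstring above relies on, namely that gathering values
and indices from every rank amounts to an allreduce of the dense tensors the
IndexedSlices represent, can be checked with a small numpy-only sketch (no MPI
involved; the shapes and data below are made up for illustration):

```python
import numpy as np

dense_shape = (6, 3)
# One (indices, values) pair per simulated rank.
per_rank = [
    (np.array([0, 2]), np.ones((2, 3)) * 1.0),
    (np.array([2, 5]), np.ones((2, 3)) * 2.0),
]

# "Allgather": concatenate indices and values across ranks.
all_indices = np.concatenate([i for i, _ in per_rank])
all_values = np.concatenate([v for _, v in per_rank])

# Densify the gathered slices and compare with summing the dense tensors.
gathered = np.zeros(dense_shape)
np.add.at(gathered, all_indices, all_values)

summed = np.zeros(dense_shape)
for indices, values in per_rank:
  dense = np.zeros(dense_shape)
  np.add.at(dense, indices, values)
  summed += dense

assert np.allclose(gathered, summed)
```

Duplicate indices are handled by the scatter-add, which is why the gathered
slices and the summed dense tensors agree; with `average=True` both sides
would additionally be divided by the number of ranks.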
+
+
+class DistributedOptimizer(tf.train.Optimizer):
+ """An optimizer that wraps another tf.Optimizer, using an MPI allreduce to
+ average gradient values before applying gradients to model weights."""
+
+ def __init__(self, optimizer, name=None, use_locking=False):
+ """Construct a new DistributedOptimizer, which uses another optimizer
+ under the hood for computing single-process gradient values and
+ applying gradient updates after the gradient values have been averaged
+ across all the MPI ranks.
+
+ Args:
+ optimizer: Optimizer to use for computing gradients and applying updates.
+ name: Optional name prefix for the operations created when applying
+ gradients. Defaults to "Distributed" followed by the provided
+ optimizer type.
+ use_locking: Whether to use locking when updating variables. See
+ Optimizer.__init__ for more info.
+ """
+ if name is None:
+ name = "Distributed{}".format(type(optimizer).__name__)
+
+ self._optimizer = optimizer
+ super(DistributedOptimizer, self).__init__(
+ name=name, use_locking=use_locking)
+
+ def compute_gradients(self, *args, **kwargs):
+ """Compute gradients of all trainable variables.
+
+ See Optimizer.compute_gradients() for more info.
+
+ In DistributedOptimizer, compute_gradients() is overridden to also
+ allreduce the gradients before returning them.
+ """
+ gradients = (super(DistributedOptimizer, self)
+ .compute_gradients(*args, **kwargs))
+ return [(allreduce(gradient), var) for (gradient, var) in gradients]
+
+ def _apply_dense(self, *args, **kwargs):
+ """Calls this same method on the underlying optimizer."""
+ return self._optimizer._apply_dense(*args, **kwargs)
+
+ def _apply_sparse(self, *args, **kwargs):
+ """Calls this same method on the underlying optimizer."""
+ return self._optimizer._apply_sparse(*args, **kwargs)
+
+ def _apply_sparse_duplicate_indices(self, *args, **kwargs):
+ """Calls this same method on the underlying optimizer."""
+ return self._optimizer._apply_sparse_duplicate_indices(*args,
+ **kwargs)
+
+ def _prepare(self, *args, **kwargs):
+ """Calls this same method on the underlying optimizer."""
+ return self._optimizer._prepare(*args, **kwargs)
+
+ def _create_slots(self, *args, **kwargs):
+ """Calls this same method on the underlying optimizer."""
+ return self._optimizer._create_slots(*args, **kwargs)
+
+ def _valid_dtypes(self, *args, **kwargs):
+ """Calls this same method on the underlying optimizer."""
+ return self._optimizer._valid_dtypes(*args, **kwargs)
+
+ def _finish(self, *args, **kwargs):
+ """Calls this same method on the underlying optimizer."""
+ return self._optimizer._finish(*args, **kwargs)
+
+
+class Session(tf.Session):
+ """A class for running TensorFlow operations, with copies of the same graph
+ running distributed across different MPI nodes.
+
+ The primary difference between `tf.Session` and
+ `tf.contrib.mpi_collectives.Session` is that the MPI `Session` ensures that
+ the `Session` options are correct for use with `tf.contrib.mpi`, and
+ initializes MPI immediately upon the start of the session.
+ """
+
+ def __init__(self, target='', graph=None, config=None):
+ """Creates a new TensorFlow MPI session.
+
+ Unlike a normal `tf.Session`, an MPI Session may only use a single GPU,
+ which must be specified in advance before the session is initialized.
+ In addition, it only uses a single graph evaluation thread, and
+ initializes MPI immediately upon starting.
+
+ If no `graph` argument is specified when constructing the session,
+ the default graph will be launched in the session. If you are
+ using more than one graph (created with `tf.Graph()`) in the same
+ process, you will have to use different sessions for each graph,
+ but each graph can be used in multiple sessions. In this case, it
+ is often clearer to pass the graph to be launched explicitly to
+ the session constructor.
+
+ Args:
+ target: (Optional.) The execution engine to connect to.
+ graph: (Optional.) The `Graph` to be launched (described above).
+ config: (Optional.) A `ConfigProto` protocol buffer with configuration
+ options for the session.
+ """
+ super(Session, self).__init__(target, graph, config=config)
+
+ # Initialize MPI on the relevant device.
+ # TODO: Move this to library load and eliminate mpi.Session()
+ if graph is None:
+ graph = tf.get_default_graph()
+ with graph.as_default():
+ self.run(init())
diff --git a/tensorflow/contrib/mpi_collectives/mpi_allgather_test.py b/tensorflow/contrib/mpi_collectives/mpi_allgather_test.py
new file mode 100644
index 0000000000..c23dd33d57
--- /dev/null
+++ b/tensorflow/contrib/mpi_collectives/mpi_allgather_test.py
@@ -0,0 +1,114 @@
+# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import os
+import numpy as np
+import tensorflow as tf
+import tensorflow.contrib.mpi_collectives as mpi
+from tensorflow.python.platform import test
+
+
+average_allgather = False
+
+
+class AllgatherTest(test.TestCase):
+ def checkAllgather(self, num_ranks, all_gathered, local_gathered):
+ # Ensure that indices match.
+ all_gat_ind = np.sort(all_gathered.indices)
+ loc_gat_ind = np.sort(local_gathered.indices)
+ assert(len(loc_gat_ind) == len(all_gat_ind))
+ for i in range(len(loc_gat_ind)):
+ assert(loc_gat_ind[i] == all_gat_ind[i])
+
+ # For each index, verify same values.
+ local_checked = []
+ for i in range(len(local_gathered.indices)):
+ local_checked.append(False)
+ for i in range(len(all_gathered.indices)):
+ all_index = all_gathered.indices[i]
+ # TODO(jthestness): Make this lookup quicker using sorting.
+ loc_index = -1
+ for j in range(len(local_gathered.indices)):
+ if local_gathered.indices[j] == all_index and not local_checked[j]:
+ loc_index = j
+ local_checked[j] = True
+ break
+ assert(loc_index >= 0)
+ correct_output = local_gathered.values[loc_index][0]
+ if average_allgather:
+ correct_output = correct_output / float(num_ranks)
+ assert(all_gathered.values[i][0] == correct_output)
+
+
+ def test_mpi_allgather(self):
+ # Get MPI rank
+ my_rank = int(os.environ['PMI_RANK'])
+ num_ranks = int(os.environ['PMI_SIZE'])
+
+ indices_per_rank = 100
+ tensor_width = 10
+
+ # Create IndexedSlices for each rank, some with overlapping indices.
+ to_gather_indices = []
+ to_gather_values = []
+ to_gather = []
+ for rank_id in range(num_ranks):
+ indices = []
+ values = []
+ my_multiple = rank_id + 1
+ current_index = my_multiple
+ for i in range(indices_per_rank):
+ indices.append(current_index)
+ ones_tensor = tf.ones([tensor_width])
+ values.append(tf.multiply(ones_tensor,
+ tf.fill(ones_tensor.get_shape(),
+ float(current_index))))
+ current_index += my_multiple
+ concat_ind = tf.stack(indices)
+ concat_vals = tf.stack(values)
+ to_gather_indices.append(concat_ind)
+ to_gather_values.append(concat_vals)
+ to_gather.append(tf.IndexedSlices(concat_vals, concat_ind))
+
+ # Collect the local IndexedSlices (indices and values) to create
+ # correct IndexedSlices output.
+ correct_gather_indices = tf.concat(to_gather_indices, 0)
+ correct_gather_values = tf.concat(to_gather_values, 0)
+ correct_gather = tf.IndexedSlices(correct_gather_values,
+ correct_gather_indices)
+
+ all_gather = mpi.allreduce(to_gather[my_rank], average_allgather)
+
+ # NOTE: This assumes that device IDs are numbered the same as ranks.
+ gpu_options = tf.GPUOptions(visible_device_list=str(my_rank))
+ config = tf.ConfigProto(gpu_options=gpu_options)
+
+ # MPI Session to test allgather.
+ with mpi.Session(config=config) as sess:
+ sess.run(tf.global_variables_initializer())
+
+ all_gathered, local_gathered = sess.run([all_gather, correct_gather])
+
+ # Compare all_gathered with local_gathered.
+ self.checkAllgather(num_ranks, all_gathered, local_gathered)
+
+
+if __name__ == '__main__':
+ test.main()
diff --git a/tensorflow/contrib/mpi_collectives/mpi_allreduce_test.py b/tensorflow/contrib/mpi_collectives/mpi_allreduce_test.py
new file mode 100644
index 0000000000..001f9170bc
--- /dev/null
+++ b/tensorflow/contrib/mpi_collectives/mpi_allreduce_test.py
@@ -0,0 +1,153 @@
+# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import os
+import numpy as np
+import tensorflow as tf
+import tensorflow.contrib.mpi_collectives as mpi
+from tensorflow.python.platform import test
+
+
+average_allreduce = False
+max_wrong_count = -1
+
+
+class AllreduceTest(test.TestCase):
+ def dumpFailure(self, my_rank, out_loc_red, my_correct, out_all_red,
+ our_correct):
+ # Find reduced/allreduced indices that are wrong and print all the
+ # values from output, slices, reduced, allreduced, so we can debug
+ # which is incorrect:
+ wrong_count = 0
+ red_dims = out_loc_red.shape
+ assert(len(red_dims) == 2)
+ for i in range(red_dims[0]):
+ for j in range(red_dims[1]):
+ suffix = ""
+ if out_loc_red[i][j] != my_correct[i][j] or \
+ out_all_red[i][j] != our_correct[i][j]:
+ suffix = "WRONG"
+ wrong_count += 1
+ print("{}\t{}\t{}\t{}\t{}\t{}"
+ .format(my_rank, i, j, out_loc_red[i][j],
+ out_all_red[i][j], suffix), flush=True)
+ if max_wrong_count > 0 and wrong_count >= max_wrong_count:
+ return
+
+ def test_mpi_allreduce(self):
+ # Get MPI rank
+ my_rank = int(os.environ['PMI_RANK'])
+ num_ranks = int(os.environ['PMI_SIZE'])
+
+ stages = 13
+ batch_size = 1331
+ hidden_size = batch_size
+ out_size = batch_size
+
+ # Input placeholder (batch_size x hidden) - init to 1s
+ inputs = tf.placeholder(tf.float32, shape=(batch_size, hidden_size),
+ name="Input")
+
+ # Large matrices (hidden x out_dim) - init random
+ weights = []
+ for i in range(stages):
+ initer = tf.constant_initializer(pow(2.0, i + 1.0))
+ weights.append(tf.get_variable("weights_{}".format(i),
+ shape=(hidden_size, out_size),
+ dtype=tf.float32,
+ initializer=initer))
+
+ # Calculate output through dependent allreduces
+ stage_input = inputs
+ for i in range(stages):
+ inter_output = tf.add(stage_input, weights[i],
+ name="add_red_{}".format(i))
+ stage_input = mpi.allreduce(inter_output,
+ average=average_allreduce)
+
+ all_reduced = stage_input
+
+ # Local reduced output for verification
+ local_input = inputs
+ for i in range(stages):
+ inter_output = tf.add(local_input, weights[i],
+ name="addin_loc_{}".format(i))
+ my_reducer = tf.Variable(initial_value=np.ones((hidden_size, out_size)),
+ dtype=tf.float32, name="loc_redr_{}".format(i))
+ for r in range(num_ranks):
+ my_reducer = tf.add(my_reducer, inter_output,
+ name="add_loc_{}_{}".format(i, r))
+ if average_allreduce:
+ local_input = tf.div(my_reducer, num_ranks,
+ name="div_loc_{}".format(i))
+ else:
+ local_input = my_reducer
+
+ local_reduced = local_input
+
+ # NOTE: This assumes that device IDs are numbered the same as ranks
+ gpu_options = tf.GPUOptions(visible_device_list=str(my_rank))
+ config = tf.ConfigProto(gpu_options=gpu_options)
+
+ # MPI Session to test allreduce
+ with mpi.Session(config=config) as sess:
+ sess.run(tf.global_variables_initializer())
+
+ input_feed = np.ones((batch_size, hidden_size), dtype=np.float32)
+ our_output = input_feed[0][0]
+ spread_var = 100
+ input_feed = input_feed + my_rank * spread_var
+ my_output = input_feed[0][0]
+ for i in range(stages):
+ curr_feed = my_output + pow(2.0, i + 1.0)
+ my_output = curr_feed * num_ranks + 1
+ curr_our_feed = our_output + pow(2.0, i + 1.0)
+ if i == 0:
+ sum_ranks = num_ranks * (num_ranks - 1) / 2
+ our_output = curr_our_feed * num_ranks + \
+ spread_var * sum_ranks
+ else:
+ our_output = curr_our_feed * num_ranks
+
+ print("rank {}: My output is {}".format(my_rank, my_output))
+ my_correct = np.zeros((batch_size, hidden_size), dtype=np.float32)
+ my_correct = my_correct + my_output
+ print("rank {}: Our output is {}".format(my_rank, our_output))
+ our_correct = np.zeros((batch_size, hidden_size), dtype=np.float32)
+ our_correct = our_correct + our_output
+
+ for i in range(1000):
+ if i % 100 == 0:
+ print("{}: iter {}".format(my_rank, i), flush=True)
+ feed_dict = {inputs: input_feed}
+ out_all_red, out_loc_red \
+ = sess.run([all_reduced, local_reduced],
+ feed_dict=feed_dict)
+
+ if not np.allclose(out_loc_red, my_correct) or \
+ not np.allclose(out_all_red, our_correct):
+ print("Test incorrect on iter {}".format(i), flush=True)
+ self.dumpFailure(my_rank, out_loc_red, my_correct, out_all_red,
+ our_correct)
+ assert(np.allclose(out_loc_red, my_correct) and
+ np.allclose(out_all_red, our_correct))
+
+
+if __name__ == '__main__':
+ test.main()
diff --git a/tensorflow/contrib/mpi_collectives/mpi_message.proto b/tensorflow/contrib/mpi_collectives/mpi_message.proto
new file mode 100644
index 0000000000..7fa5e20301
--- /dev/null
+++ b/tensorflow/contrib/mpi_collectives/mpi_message.proto
@@ -0,0 +1,64 @@
+/* Copyright 2016 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+syntax = "proto3";
+
+package tensorflow.contrib.mpi;
+
+import "tensorflow/core/framework/tensor_shape.proto";
+import "tensorflow/core/framework/types.proto";
+
+// An MPIRequest is a message sent from a rank greater than zero to the
+// coordinator (rank zero), informing the coordinator of an operation that
+// the rank wants to do and the tensor that it wants to apply the operation to.
+message MPIRequest {
+ enum RequestType {
+ ALLREDUCE = 0;
+ ALLGATHER = 1;
+ }
+
+ // The request rank is necessary to create a consistent ordering of results,
+ // for example in the allgather where the order of outputs should be sorted
+ // by rank.
+ int32 request_rank = 1;
+ RequestType request_type = 2;
+ DataType tensor_type = 3;
+ string tensor_name = 4;
+ TensorShapeProto tensor_shape = 5;
+};
+
+// An MPIResponse is a message sent from the coordinator (rank zero) to a rank
+// greater than zero, informing the rank that an operation should be performed
+// now. If the operation requested would result in an error (for example, due
+// to a type or shape mismatch), then the MPIResponse can contain an error and
+// an error message instead. Finally, an MPIResponse can be a DONE message (if
+// there are no more tensors to reduce on this tick of the background loop) or
+// SHUTDOWN if all MPI processes should shut down.
+message MPIResponse {
+ enum ResponseType {
+ ALLREDUCE = 0;
+ ALLGATHER = 1;
+ ERROR = 2;
+ DONE = 3;
+ SHUTDOWN = 4;
+ }
+
+ // Empty if the type is DONE or SHUTDOWN.
+ ResponseType response_type = 1;
+ string tensor_name = 2;
+
+ // Empty unless response_type is ERROR.
+ string error_message = 3;
+};
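
To make the request/response flow described in the comments above concrete,
here is a minimal Python sketch of the coordinator's bookkeeping. The
dict-based messages and the `Coordinator` class are illustrative stand-ins,
not the generated protobuf classes or the actual logic in mpi_ops.cc:

```python
from collections import defaultdict

class Coordinator(object):
  # Rank zero's bookkeeping: collect one request per rank for a tensor name,
  # and only then tell every rank to run the collective (or report an error).
  def __init__(self, mpi_size):
    self.mpi_size = mpi_size
    self.pending = defaultdict(list)  # tensor_name -> requests received so far

  def handle_request(self, request_rank, request_type, tensor_name):
    # Record a request; return a response once every rank has asked for it.
    self.pending[tensor_name].append((request_rank, request_type))
    if len(self.pending[tensor_name]) < self.mpi_size:
      return None  # keep waiting for the remaining ranks
    types = {t for _, t in self.pending.pop(tensor_name)}
    if len(types) != 1:
      return {"response_type": "ERROR", "tensor_name": tensor_name,
              "error_message": "Mismatched MPI operations: %s" % sorted(types)}
    return {"response_type": types.pop(), "tensor_name": tensor_name}

coord = Coordinator(mpi_size=2)
assert coord.handle_request(0, "ALLREDUCE", "grad/w") is None  # rank 0 waits
print(coord.handle_request(1, "ALLREDUCE", "grad/w"))          # all ranks ready
```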
diff --git a/tensorflow/contrib/mpi_collectives/mpi_ops.cc b/tensorflow/contrib/mpi_collectives/mpi_ops.cc
new file mode 100644
index 0000000000..a051ab0004
--- /dev/null
+++ b/tensorflow/contrib/mpi_collectives/mpi_ops.cc
@@ -0,0 +1,1236 @@
+/* Copyright 2016 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifdef TENSORFLOW_USE_MPI
+
+#include <queue>
+#include <thread>
+#include <unordered_map>
+
+#include "tensorflow/core/framework/op.h"
+#include "tensorflow/core/framework/op_kernel.h"
+#include "tensorflow/core/framework/shape_inference.h"
+#include "tensorflow/core/framework/types.pb.h"
+#include "tensorflow/core/platform/mutex.h"
+
+#define EIGEN_USE_THREADS
+
+#if GOOGLE_CUDA
+#include <cuda_runtime.h>
+#include "tensorflow/stream_executor/stream.h"
+#endif
+
+#include "tensorflow/stream_executor/lib/statusor.h"
+
+#define OMPI_SKIP_MPICXX
+#include "third_party/mpi/mpi.h"
+#include "tensorflow/contrib/mpi_collectives/mpi_message.pb.h"
+#include "tensorflow/contrib/mpi_collectives/ring.h"
+
+/*
+ * MPI Allreduce and Allgather Ops for TensorFlow.
+ *
+ * TensorFlow natively provides inter-device communication through send and
+ * receive ops and inter-node communication through Distributed TensorFlow,
+ * based on the same send and receive abstractions. These end up being
+ * insufficient for synchronous data-parallel training on HPC clusters where
+ * Infiniband or other high-speed interconnects are available. This module
+ * implements MPI ops for allgather and allreduce, which do bandwidth-optimal
+ * gathers and reductions and can take advantage of hardware-optimized
+ * communication libraries through the MPI implementation.
+ *
+ * The primary logic of the allreduce and allgather are in RingAllgather() and
+ * RingAllreduce(). The background thread which facilitates MPI operations is
+ * run in BackgroundThreadLoop(). The provided MPI ops are:
+ * – MPIInit:
+ * Initialize MPI on a given device (CPU or GPU).
+ * Should only be run on a single device in every process.
+ * – MPISize:
+ * Get the number of MPI processes in the global communicator.
+ * – MPIRank:
+ * Get the rank of the current MPI process in the global communicator.
+ * – MPILocalRank:
+ * Get the local rank of the current MPI process within its node.
+ * – MPIAllreduce:
+ * Perform an allreduce on a Tensor, returning the sum
+ * across all MPI processes in the global communicator.
+ * – MPIAllgather:
+ * Perform an allgather on a Tensor, returning the concatenation of
+ * the tensor on the first dimension across all MPI processes in the
+ * global communicator.
+ *
+ */
+
+template <class T>
+using StatusOr = perftools::gputools::port::StatusOr<T>;
+
+using CPUDevice = Eigen::ThreadPoolDevice;
+using GPUDevice = Eigen::GpuDevice;
+
+namespace tensorflow {
+namespace contrib {
+namespace mpi {
+
+// Make sure template specializations are generated in the ring.cu.cc and the
+// ring.cc file, not in this file.
+extern template Status RingAllreduce<GPUDevice, int>(OpKernelContext*,
+ const Tensor*, Tensor*,
+ Tensor*);
+extern template Status RingAllreduce<GPUDevice, long long>(OpKernelContext*,
+ const Tensor*,
+ Tensor*, Tensor*);
+extern template Status RingAllreduce<GPUDevice, float>(OpKernelContext*,
+ const Tensor*, Tensor*,
+ Tensor*);
+extern template Status RingAllgather<GPUDevice, int>(OpKernelContext*,
+ const Tensor*,
+ const std::vector<size_t>&,
+ Tensor*);
+extern template Status RingAllgather<GPUDevice, long long>(
+ OpKernelContext*, const Tensor*, const std::vector<size_t>&, Tensor*);
+extern template Status RingAllgather<GPUDevice, float>(
+ OpKernelContext*, const Tensor*, const std::vector<size_t>&, Tensor*);
+extern template Status RingAllreduce<CPUDevice, int>(OpKernelContext*,
+ const Tensor*, Tensor*,
+ Tensor*);
+extern template Status RingAllreduce<CPUDevice, long long>(OpKernelContext*,
+ const Tensor*,
+ Tensor*, Tensor*);
+extern template Status RingAllreduce<CPUDevice, float>(OpKernelContext*,
+ const Tensor*, Tensor*,
+ Tensor*);
+extern template Status RingAllgather<CPUDevice, int>(OpKernelContext*,
+ const Tensor*,
+ const std::vector<size_t>&,
+ Tensor*);
+extern template Status RingAllgather<CPUDevice, long long>(
+ OpKernelContext*, const Tensor*, const std::vector<size_t>&, Tensor*);
+extern template Status RingAllgather<CPUDevice, float>(
+ OpKernelContext*, const Tensor*, const std::vector<size_t>&, Tensor*);
+
+namespace {
+
+// Return true if the templated type is GPUDevice, otherwise false.
+template <typename T>
+bool IsGPUDevice();
+template <>
+bool IsGPUDevice<GPUDevice>() {
+ return true;
+};
+template <>
+bool IsGPUDevice<CPUDevice>() {
+ return false;
+};
+
+// A callback to call after the MPI communication completes. Since the
+// allreduce and allgather ops are asynchronous, this callback is what resumes
+// computation after the reduction is completed.
+typedef std::function<void(StatusOr<Tensor>)> CommunicationDoneCallback;
+
+struct CollectiveOpRecord {
+ // The rank performing this piece of the op
+ int rank;
+
+ // The name of the op/tensor to be reduced
+ std::string name;
+
+ // The op's kernel context
+ OpKernelContext* context;
+
+ // Data type of the op
+ DataType dtype;
+
+ // The input tensor
+ const Tensor* in_t;
+
+ // Allgather: Vector of per-rank first-dimension sizes
+ std::vector<size_t> sizes_vec;
+
+ // The temp tensor for intermediate results
+ Tensor temp_t;
+
+ // The output tensor
+ Tensor* out_t;
+
+ // Whether to run this op on the gpu
+ bool on_gpu;
+
+ // The callback to call after the op has completed
+ CommunicationDoneCallback callback;
+};
+
+// Table storing Tensors to be reduced, keyed by unique name.
+// This table contains everything necessary to do the reduction
+typedef std::unordered_map<std::string, CollectiveOpRecord> TensorTable;
+
+// Table for storing Tensor metadata on rank zero. This is used for error
+// checking and size calculations, as well as determining when a reduction is
+// ready to be done (when all nodes are ready to do it).
+typedef std::unordered_map<std::string, std::vector<MPIRequest> > MessageTable;
+
+// The global state required for the MPI ops.
+//
+// MPI is a library that stores a lot of global per-program state and often
+// requires running on a single thread. As a result, we have to have a single
+// background thread responsible for all MPI operations, and communicate with
+// that background thread through global state.
+struct MPIGlobalState {
+ // An atomic boolean which is set to true when MPI is initialized.
+ // This ensures that MPI_Init is never called twice.
+ std::atomic_flag initialized_flag = ATOMIC_FLAG_INIT;
+
+ // Condition variable to wait for initialization
+ condition_variable cv;
+
+ // Whether MPI_Init has been completed on the background thread.
+ bool initialization_done = false;
+
+ // Whether MPI_Init succeeded on the background thread.
+ Status init_status;
+
+ // A mutex that needs to be used whenever MPI operations touch
+ // shared structures.
+ mutex mu;
+
+ // Tensors waiting to be allreduced or allgathered.
+ TensorTable tensor_table;
+
+ // Queue of MPI requests waiting to be sent to the coordinator node.
+ std::queue<MPIRequest> message_queue;
+
+ // Background thread running MPI communication.
+ std::thread background_thread;
+
+ // Whether the background thread should shutdown.
+ bool shut_down = false;
+
+ // Only exists on the coordinator node (rank zero). Maintains a count of
+ // how many nodes are ready to allreduce every tensor (keyed by tensor
+ // name).
+ std::unique_ptr<MessageTable> message_table;
+
+ // The MPI rank, local rank, and size.
+ int rank = 0;
+ int local_rank = 0;
+ int size = 1;
+
+ // The device that MPI was initialized on. (-1 for no GPU)
+ int device = -1;
+
+ // The CUDA stream used for data transfers and within-allreduce operations.
+ // A naive implementation would use the TensorFlow StreamExecutor CUDA
+ // stream. However, the allreduce and allgather require doing memory copies
+ // and kernel executions (for accumulation of values on the GPU), and the
+ // subsequent operations must wait for those operations to complete,
+ // otherwise MPI (which uses its own stream internally) will begin the data
+ // transfers before the CUDA calls are complete. In order to wait for those
+ // CUDA operations, if we were using the TensorFlow stream, we would have
+ // to synchronize that stream; however, other TensorFlow threads may be
+ // submitting more work to that stream, so synchronizing on it can cause
+ // the allreduce to be delayed, waiting for compute totally unrelated to it
+ // in other parts of the graph. Overlaying memory transfers and compute
+ // during backpropagation is crucial for good performance, so we cannot use
+ // the TensorFlow stream, and must use our own stream.
+#if GOOGLE_CUDA
+ cudaStream_t stream;
+ std::atomic_flag stream_created_flag = ATOMIC_FLAG_INIT;
+#endif
+
+ ~MPIGlobalState() {
+ // Make sure that the destructor of the background thread is safe to
+ // call. If a thread is still joinable (not detached or complete) its
+ // destructor cannot be called.
+ if (background_thread.joinable()) {
+ shut_down = true;
+ background_thread.join();
+ }
+ }
+};
+
+// All the MPI state that must be stored globally per-process.
+static MPIGlobalState mpi_global;
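
As a rough sketch of the threading model described in the comments above (one
background thread owning all communication, fed through shared state, while op
threads only enqueue work and wait on callbacks), the following Python-only
example uses a queue and a shutdown sentinel. The class name and the stand-in
collective are assumptions made for illustration, not part of this library:

```python
import queue
import threading

class BackgroundLoop(object):
  # All "MPI" work is funneled through one background thread that drains a
  # shared queue; op threads only enqueue requests and wait on callbacks.
  def __init__(self):
    self._requests = queue.Queue()
    self._shutdown = object()  # sentinel placed on the queue to stop the loop
    self._thread = threading.Thread(target=self._run, daemon=True)
    self._thread.start()

  def _run(self):
    # Only this thread touches the (pretend) communication library.
    while True:
      item = self._requests.get()
      if item is self._shutdown:
        return
      values, done = item
      done(sum(values))  # stand-in for the real collective operation

  def enqueue(self, values, done):
    # Called from op threads; hands work to the background thread.
    self._requests.put((values, done))

  def stop(self):
    self._requests.put(self._shutdown)
    self._thread.join()

loop = BackgroundLoop()
finished = threading.Event()
loop.enqueue([1, 2, 3], lambda total: (print("reduced:", total), finished.set()))
finished.wait()
loop.stop()
```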
+
+// For clarity in argument lists.
+#define RANK_ZERO 0
+
+// A tag used for all coordinator messaging.
+#define TAG_NOTIFY 1
+
+// Store the MPIRequest for a name, and return whether the total count of
+// MPIRequests for that tensor is now equal to the MPI size (and thus we are
+// ready to reduce the tensor).
+bool IncrementTensorCount(std::unique_ptr<MessageTable>& message_table,
+ MPIRequest msg, int mpi_size) {
+ auto name = msg.tensor_name();
+ auto table_iter = message_table->find(name);
+ if (table_iter == message_table->end()) {
+ message_table->emplace(name, std::vector<MPIRequest>({msg}));
+ table_iter = message_table->find(name);
+ } else {
+ table_iter->second.push_back(msg);
+ }
+
+ int count = table_iter->second.size();
+ return count == mpi_size;
+}
+
+// Once a tensor is ready to be reduced, the coordinator sends an MPIResponse
+// to all ranks instructing them to start the reduction. The MPIResponse
+// also contains error messages in case the submitted MPIRequests were not
+// valid (for example, contained mismatched shapes or types).
+//
+// Constructing the MPIResponse, thus, requires a whole lot of error checking.
+MPIResponse ConstructMPIResponse(std::unique_ptr<MessageTable>& message_table,
+ std::string name) {
+ bool error = false;
+ auto it = message_table->find(name);
+ assert(it != message_table->end());
+
+ std::vector<MPIRequest> requests = it->second;
+ assert(requests.size() > 0);
+
+ std::ostringstream error_message_stream;
+
+ // Check that all data types being reduced or gathered are identical
+ auto data_type = requests[0].tensor_type();
+ for (unsigned int i = 1; i < requests.size(); i++) {
+ auto request_type = requests[i].tensor_type();
+ if (data_type != request_type) {
+ error = true;
+ error_message_stream << "Mismatched data types: One rank had type "
+ << DataType_Name(data_type)
+ << ", but another rank had type "
+ << DataType_Name(request_type) << ".";
+ break;
+ }
+ }
+
+ // Check that all requested operations are the same
+ auto message_type = requests[0].request_type();
+ for (unsigned int i = 1; i < requests.size(); i++) {
+ if (error) {
+ break;
+ }
+
+ auto request_type = requests[i].request_type();
+ if (message_type != request_type) {
+ error = true;
+ error_message_stream << "Mismatched MPI operations: One rank did an "
+ << message_type << ", but another rank did an "
+ << request_type << ".";
+ break;
+ }
+ }
+
+ // If we are doing an allreduce, check that all tensor shapes
+ // are identical
+ if (message_type == MPIRequest::ALLREDUCE) {
+ TensorShape tensor_shape = requests[0].tensor_shape();
+ for (unsigned int i = 1; i < requests.size(); i++) {
+ if (error) {
+ break;
+ }
+
+ TensorShape request_shape = requests[i].tensor_shape();
+ if (tensor_shape != request_shape) {
+ error = true;
+ error_message_stream << "Mismatched allreduce tensor shapes: "
+ << "One rank reduced a tensor of shape "
+ << tensor_shape.DebugString()
+ << ", but another rank sent a tensor of shape "
+ << request_shape.DebugString() << ".";
+ break;
+ }
+ }
+ }
+
+ // If we are doing an allgather, make sure all but the first dimension are
+ // the same. The first dimension may be different and the output tensor is
+ // the sum of the first dimension. Collect the sizes by rank.
+ if (message_type == MPIRequest::ALLGATHER) {
+ TensorShape tensor_shape = requests[0].tensor_shape();
+
+ if (tensor_shape.dims() == 0) {
+ error = true;
+ error_message_stream << "Rank zero tried to gather a rank-zero tensor.";
+ }
+
+ for (unsigned int i = 1; i < requests.size(); i++) {
+ if (error) {
+ break;
+ }
+
+ TensorShape request_shape = requests[i].tensor_shape();
+ if (tensor_shape.dims() != request_shape.dims()) {
+ error = true;
+ error_message_stream << "Mismatched allgather tensor shapes: "
+ << "One rank gathered a tensor of rank "
+ << tensor_shape.dims()
+ << ", but another rank sent a tensor of rank "
+ << request_shape.dims() << ".";
+ break;
+ }
+
+ for (unsigned int dim = 1; dim < tensor_shape.dims(); dim++) {
+ if (tensor_shape.dim_size(dim) != request_shape.dim_size(dim)) {
+ error = true;
+ error_message_stream
+ << "Mismatched allgather tensor shapes: "
+ << "One rank gathered a tensor with dimension " << dim
+ << " equal to " << tensor_shape.dim_size(dim)
+ << ", but another rank sent a tensor with dimension " << dim
+ << " equal to " << request_shape.dim_size(dim) << ".";
+ break;
+ }
+ }
+ }
+ }
+
+ MPIResponse response;
+ response.set_tensor_name(name);
+ if (error) {
+ std::string error_message = error_message_stream.str();
+ response.set_response_type(MPIResponse::ERROR);
+ response.set_error_message(error_message);
+ } else {
+ auto response_type = MPIResponse::ERROR;
+ if (message_type == MPIRequest::ALLREDUCE) {
+ response_type = MPIResponse::ALLREDUCE;
+ } else {
+ response_type = MPIResponse::ALLGATHER;
+ }
+ response.set_response_type(response_type);
+ }
+
+ // Clear all queued up requests for this name. They are now taken care of
+ // by the constructed MPI response.
+ message_table->erase(it);
+
+ return response;
+}
+
+// Process an MPIResponse by doing a reduction, a gather, or raising an error.
+void PerformCollectiveOp(TensorTable& tensor_table, MPIResponse response) {
+ OpKernelContext* context;
+ const Tensor* input_tensor;
+ std::vector<size_t> sizes_vec;
+ Tensor temp_tensor;
+ Tensor* output_tensor;
+ CommunicationDoneCallback callback;
+ bool on_gpu;
+ {
+ // Lock on the tensor table.
+ mutex_lock guard(mpi_global.mu);
+
+ // We should never fail at finding this key in the tensor table.
+ auto name = response.tensor_name();
+ auto iter = tensor_table.find(name);
+ assert(iter != tensor_table.end());
+
+ assert(response.response_type() == MPIResponse::ALLREDUCE ||
+ response.response_type() == MPIResponse::ALLGATHER ||
+ response.response_type() == MPIResponse::ERROR);
+
+ CollectiveOpRecord record = iter->second;
+ context = record.context;
+ input_tensor = record.in_t;
+ sizes_vec = record.sizes_vec;
+ temp_tensor = record.temp_t;
+ output_tensor = record.out_t;
+ on_gpu = record.on_gpu;
+ callback = record.callback;
+
+ // Clear the tensor table of this tensor and its callbacks; the rest of
+ // this function takes care of it.
+ tensor_table.erase(iter);
+ }
+
+ // Use CPUDevice instead of GPUDevice if no CUDA, to ensure we don't
+ // link to non-existent symbols.
+#if GOOGLE_CUDA
+#define GPU_DEVICE_IF_CUDA GPUDevice
+#else
+#define GPU_DEVICE_IF_CUDA CPUDevice
+#endif
+
+ Status status;
+ auto dtype = input_tensor->dtype();
+ if (response.response_type() == MPIResponse::ALLGATHER) {
+ if (dtype == DT_FLOAT) {
+ status = on_gpu ? RingAllgather<GPU_DEVICE_IF_CUDA, float>(
+ context, input_tensor, sizes_vec, output_tensor)
+ : RingAllgather<CPUDevice, float>(
+ context, input_tensor, sizes_vec, output_tensor);
+ } else if (dtype == DT_INT32) {
+ status = on_gpu ? RingAllgather<GPU_DEVICE_IF_CUDA, int>(
+ context, input_tensor, sizes_vec, output_tensor)
+ : RingAllgather<CPUDevice, int>(context, input_tensor,
+ sizes_vec, output_tensor);
+ } else if (dtype == DT_INT64) {
+ status = on_gpu ? RingAllgather<GPU_DEVICE_IF_CUDA, long long>(
+ context, input_tensor, sizes_vec, output_tensor)
+ : RingAllgather<CPUDevice, long long>(
+ context, input_tensor, sizes_vec, output_tensor);
+ } else {
+ status = errors::Unknown("Invalid tensor type for MPI allgather.");
+ }
+ } else if (response.response_type() == MPIResponse::ALLREDUCE) {
+ if (dtype == DT_FLOAT) {
+ status = on_gpu ? RingAllreduce<GPU_DEVICE_IF_CUDA, float>(
+ context, input_tensor, &temp_tensor, output_tensor)
+ : RingAllreduce<CPUDevice, float>(
+ context, input_tensor, &temp_tensor, output_tensor);
+ } else if (dtype == DT_INT32) {
+ status = on_gpu ? RingAllreduce<GPU_DEVICE_IF_CUDA, int>(
+ context, input_tensor, &temp_tensor, output_tensor)
+ : RingAllreduce<CPUDevice, int>(
+ context, input_tensor, &temp_tensor, output_tensor);
+ } else if (dtype == DT_INT64) {
+ status = on_gpu ? RingAllreduce<GPU_DEVICE_IF_CUDA, long long>(
+ context, input_tensor, &temp_tensor, output_tensor)
+ : RingAllreduce<CPUDevice, long long>(
+ context, input_tensor, &temp_tensor, output_tensor);
+ } else {
+ status = errors::Unknown("Invalid tensor type for MPI allreduce.");
+ }
+ } else if (response.response_type() == MPIResponse::ERROR) {
+ status = errors::FailedPrecondition(response.error_message());
+ }
+
+ if (status.ok()) {
+ callback(StatusOr<Tensor>(*output_tensor));
+ } else {
+ callback(StatusOr<Tensor>(status));
+ }
+}
+
+// The MPI background thread loop coordinates all the MPI processes and the
+// tensor reductions. The design of the communicator mechanism is limited by a
+// few considerations:
+//
+// 1. Some MPI implementations require all MPI calls to happen from a
+// single thread. Since TensorFlow may use several threads for graph
+// processing, this means we must have our own dedicated thread for
+// dealing with MPI.
+// 2. We want to gracefully handle errors, when MPI processes do not
+// properly agree upon what should happen (such as mismatched types or
+// shapes). To do so requires the MPI processes to know about the shapes
+// and types of the relevant tensors on the other processes.
+// 3. The MPI reductions and gathers should be able to happen in parallel
+// with other ongoing operations. Since MPI uses an internal
+// (inaccessible) GPU stream separate from the TF GPUDevice streams, we
+// cannot explicitly synchronize memcpys or kernels with it. As a result,
+// MPIAllreduce and MPIAllgather must be AsyncOpKernels to ensure proper
+// ordering of memcpys and kernels with respect to TF streams.
+// 4. NOTE: We cannot guarantee that all the MPI processes reduce their
+// tensors in the same order. Thus, there must be a way to ensure the
+// reduction memcpys and kernels occur for correct tensors across all
+// ranks at the same time. We choose to use a coordinator (rank ID 0) to
+// gather and trigger the reduction operations that are ready to execute.
+//
+// The coordinator currently follows a master-worker paradigm. Rank zero acts
+// as the master (the "coordinator"), whereas all other ranks are simply
+// workers. Each rank runs its own background thread which progresses in ticks.
+// In each tick, the following actions happen:
+//
+// a) The workers send any available MPIRequests to the coordinator. These
+// MPIRequests indicate what the worker would like to do (i.e. which
+// tensor they would like to gather or reduce, as well as their shape and
+// type). They repeat this for every tensor that they would like to
+// operate on after that tensor's collective op has executed ComputeAsync.
+//
+// b) The workers send an empty "DONE" message to the coordinator to
+// indicate that there are no more tensors they wish to operate on.
+//
+// c) The coordinator receives the MPIRequests from the workers, as well
+// as from its own TensorFlow ops, and stores them in a request table. The
+// coordinator continues to receive MPIRequest messages until it has
+// received MPI_SIZE number of empty "DONE" messages.
+//
+// d) The coordinator finds all tensors that are ready to be reduced,
+// gathered, or all operations that result in an error. For each of those,
+// it sends an MPIResponse to all the workers. When no more MPIResponses
+// are available, it sends a "DONE" response to the workers. If the
+// process is being shutdown, it instead sends a "SHUTDOWN" response.
+//
+// e) The workers listen for MPIResponse messages, processing each one by
+// doing the required reduce or gather, until they receive a "DONE"
+// response from the coordinator. At that point, the tick ends.
+// If instead of "DONE" they receive "SHUTDOWN", they exit their
+// background loop.
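+//
+// As a concrete illustration (a hypothetical tick, not part of the protocol
+// itself), suppose there are three ranks and each has queued a request for a
+// tensor named "grad0":
+//
+//   - Ranks 1 and 2 each send MPIRequest("grad0") to rank zero, followed by
+//     an empty DONE message.
+//   - Rank zero records its own request and the two received ones; once the
+//     count for "grad0" reaches the MPI size (3), the tensor is ready.
+//   - Rank zero sends MPIResponse(ALLREDUCE, "grad0") to ranks 1 and 2, all
+//     ranks perform the collective, and rank zero then sends a DONE (or
+//     SHUTDOWN) response to end the tick.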
+// TODO: Use the global mpi_global state variable instead of a local one
+void BackgroundThreadLoop() {
+#if GOOGLE_CUDA
+ // Set the device, so that this thread uses the same GPU context as the
+ // calling thread.
+ // TODO: Ensure that this is operating correctly. The background thread
+ // needs to be able to control all GPUs that the rank has access to, and
+ // there might be more than one GPU. Tensors could be resident in any of the
+ // GPUs, so the background thread's accumulate and copy kernels might need
+ // to correctly set the device and it might be necessary for the background
+ // thread to manage multiple streams.
+ cudaSetDevice(mpi_global.device);
+ cudaStreamCreate(&mpi_global.stream);
+#endif
+
+ // Initialize MPI. This must happen on the background thread, since not all
+ // MPI implementations support being called from multiple threads.
+ auto init_result = MPI_Init(NULL, NULL);
+ if (init_result != MPI_SUCCESS) {
+ mpi_global.init_status =
+ errors::Unknown("Could not initialize MPI; MPI_Init() failed.");
+ mpi_global.initialization_done = true;
+ mpi_global.cv.notify_all();
+ return;
+ } else {
+ mpi_global.init_status = Status::OK();
+ }
+
+ // Get MPI rank to determine if we are rank zero.
+ int rank;
+ MPI_Comm_rank(MPI_COMM_WORLD, &rank);
+ bool is_coordinator = rank == 0;
+
+ // Get MPI size to determine how many tensors to wait for before reducing.
+ int size;
+ MPI_Comm_size(MPI_COMM_WORLD, &size);
+
+ // Determine local rank by querying the local communicator.
+ MPI_Comm local_comm;
+ MPI_Comm_split_type(MPI_COMM_WORLD, MPI_COMM_TYPE_SHARED, 0, MPI_INFO_NULL,
+ &local_comm);
+ int local_rank;
+ MPI_Comm_rank(local_comm, &local_rank);
+
+ mpi_global.rank = rank;
+ mpi_global.local_rank = local_rank;
+ mpi_global.size = size;
+ mpi_global.initialization_done = true;
+
+ // Notify calling thread that initialization is complete
+ mpi_global.cv.notify_all();
+
+ // TODO: MOVE MESSAGE TABLE INITIALIZATION TO LIBRARY LOAD!
+ // Initialize the tensor count table. No tensors are available yet.
+ if (is_coordinator) {
+ mpi_global.message_table =
+ std::unique_ptr<MessageTable>(new MessageTable());
+ }
+
+ // The coordinator sends a SHUTDOWN message to trigger shutdown.
+ bool should_shut_down = false;
+ do {
+ // TODO: Eliminate the need for thread sleep by making all activity
+ // depend on other activity (e.g. condition or MPI waits).
+ std::this_thread::sleep_for(std::chrono::milliseconds(1));
+
+ // Copy the data structures from global state under this lock.
+ // However, don't keep the lock for the rest of the loop, so that
+ // enqueued stream callbacks can continue.
+ std::queue<MPIRequest> message_queue;
+ {
+ mutex_lock guard(mpi_global.mu);
+ while (!mpi_global.message_queue.empty()) {
+ MPIRequest message = mpi_global.message_queue.front();
+ mpi_global.message_queue.pop();
+ message_queue.push(message);
+ }
+ }
+
+ // Collect all tensors that are ready to be reduced. Record them in the
+ // tensor count table (rank zero) or send them to rank zero to be
+ // recorded (everyone else).
+ std::vector<std::string> ready_to_reduce;
+ while (!message_queue.empty()) {
+ // Pop the first available message
+ MPIRequest message = message_queue.front();
+ message_queue.pop();
+
+ if (is_coordinator) {
+ bool reduce =
+ IncrementTensorCount(mpi_global.message_table, message, size);
+ if (reduce) {
+ ready_to_reduce.push_back(message.tensor_name());
+ }
+ } else {
+ std::string encoded_message;
+ message.SerializeToString(&encoded_message);
+ MPI_Send(encoded_message.c_str(), encoded_message.length() + 1,
+ MPI_BYTE, RANK_ZERO, TAG_NOTIFY, MPI_COMM_WORLD);
+ }
+ }
+
+ // Rank zero has put all its own tensors in the tensor count table.
+ // Now, it should count all the tensors that are coming from other
+ // ranks at this tick. It should keep getting tensors until it gets a
+ // DONE message from all the other ranks.
+ if (is_coordinator) {
+ // Count of DONE messages. Keep receiving messages until the number
+ // of messages is equal to the number of processes. Initialize to
+ // one since the coordinator is effectively done.
+ int completed_ranks = 1;
+ while (completed_ranks != size) {
+ MPI_Status status;
+ MPI_Probe(MPI_ANY_SOURCE, TAG_NOTIFY, MPI_COMM_WORLD, &status);
+
+ // Find number of characters in message (including zero byte).
+ int source_rank = status.MPI_SOURCE;
+ int msg_length;
+ MPI_Get_count(&status, MPI_BYTE, &msg_length);
+
+ // If the length is zero, this is a DONE message.
+ if (msg_length == 0) {
+ completed_ranks++;
+ MPI_Recv(NULL, 0, MPI_BYTE, source_rank, TAG_NOTIFY, MPI_COMM_WORLD,
+ &status);
+ continue;
+ }
+
+ // Receive the serialized MPIRequest into an std::string. Use the explicit
+ // length so embedded zero bytes in the protobuf do not truncate the string.
+ char* buffer = new char[msg_length];
+ MPI_Recv(buffer, msg_length, MPI_BYTE, source_rank, TAG_NOTIFY,
+ MPI_COMM_WORLD, &status);
+ std::string received_data(buffer, msg_length - 1);
+ delete[] buffer;
+
+ MPIRequest received_message;
+ received_message.ParseFromString(received_data);
+ auto received_name = received_message.tensor_name();
+
+ bool reduce = IncrementTensorCount(mpi_global.message_table,
+ received_message, size);
+ if (reduce) {
+ ready_to_reduce.push_back(received_name);
+ }
+ }
+
+ // At this point, rank zero should have a fully updated tensor
+ // count table and should know all the tensors that need to be
+ // reduced or gathered, and everyone else should have sent all
+ // their information to rank zero. We can now do reductions and
+ // gathers; rank zero will choose which ones and in what order,
+ // and will notify the other ranks before doing each reduction.
+ for (int i = 0; i < ready_to_reduce.size(); i++) {
+ // Notify all nodes which tensor we'd like to reduce now
+ auto name = ready_to_reduce[i];
+ MPIResponse response =
+ ConstructMPIResponse(mpi_global.message_table, name);
+
+ std::string encoded_response;
+ response.SerializeToString(&encoded_response);
+ for (int r = 1; r < size; r++) {
+ MPI_Send(encoded_response.c_str(), encoded_response.length() + 1,
+ MPI_BYTE, r, TAG_NOTIFY, MPI_COMM_WORLD);
+ }
+
+ // Perform the reduction. All nodes should end up performing
+ // the same reduction.
+ PerformCollectiveOp(mpi_global.tensor_table, response);
+ }
+
+ // Notify all nodes that we are done with the reductions for this
+ // tick.
+ MPIResponse done_response;
+ should_shut_down = mpi_global.shut_down;
+ done_response.set_response_type(
+ mpi_global.shut_down ? MPIResponse::SHUTDOWN : MPIResponse::DONE);
+ std::string encoded_response;
+ done_response.SerializeToString(&encoded_response);
+ for (int r = 1; r < size; r++) {
+ MPI_Send(encoded_response.c_str(), encoded_response.length() + 1,
+ MPI_BYTE, r, TAG_NOTIFY, MPI_COMM_WORLD);
+ }
+ } else {
+ // Notify the coordinator that this node is done sending messages.
+ // A DONE message is encoded as a zero-length message.
+ MPI_Send(NULL, 0, MPI_BYTE, RANK_ZERO, TAG_NOTIFY, MPI_COMM_WORLD);
+
+ // Receive MPIResponses from rank zero indicating which tensors to
+ // reduce. Once we receive a DONE response, stop waiting for more.
+ while (true) {
+ MPI_Status status;
+ MPI_Probe(0, TAG_NOTIFY, MPI_COMM_WORLD, &status);
+
+ // Find number of characters in message (including zero byte).
+ int msg_length;
+ MPI_Get_count(&status, MPI_BYTE, &msg_length);
+
+ // Receive the serialized MPIResponse into an std::string. Use the explicit
+ // length so embedded zero bytes in the protobuf do not truncate the string.
+ char* buffer = new char[msg_length];
+ MPI_Recv(buffer, msg_length, MPI_BYTE, 0, TAG_NOTIFY, MPI_COMM_WORLD,
+ &status);
+ std::string received_message(buffer, msg_length - 1);
+ delete[] buffer;
+
+ MPIResponse response;
+ response.ParseFromString(received_message);
+ if (response.response_type() == MPIResponse::DONE) {
+ // No more messages this tick
+ break;
+ } else if (response.response_type() == MPIResponse::SHUTDOWN) {
+ // No more messages this tick, and the background thread
+ // should shut down
+ should_shut_down = true;
+ break;
+ } else {
+ // Process the current message
+ PerformCollectiveOp(mpi_global.tensor_table, response);
+ }
+ }
+ }
+ } while (!should_shut_down);
+
+ MPI_Finalize();
+}
+
+// Initialize MPI and start the MPI background thread. Ensure that this is
+// only done once no matter how many times this function is called.
+Status InitializeMPIOnce(bool gpu) {
+ // Ensure MPI is only initialized once.
+ if (mpi_global.initialized_flag.test_and_set()) return mpi_global.init_status;
+
+ mpi_global.device = -1;
+#if GOOGLE_CUDA
+ if (gpu) {
+ cudaGetDevice(&mpi_global.device);
+ }
+#endif
+
+ // Start the MPI background thread, which performs MPI initialization and
+ // then handles all MPI communication.
+ // TODO: Change this to a Tensorflow thread
+ mpi_global.background_thread = std::thread(BackgroundThreadLoop);
+
+ // Wait until the background thread has finished initializing MPI. Loop on
+ // the predicate to guard against spurious wakeups and missed notifications.
+ mutex_lock guard(mpi_global.mu);
+ while (!mpi_global.initialization_done) {
+ mpi_global.cv.wait(guard);
+ }
+
+ return mpi_global.init_status;
+}
+
+// Check that MPI is initialized.
+Status IsMPIInitialized() {
+ if (!mpi_global.initialization_done) {
+ return errors::FailedPrecondition(
+ "MPI has not been initialized; use tf.contrib.mpi.Session.");
+ }
+ return Status::OK();
+}
+
+// This function (called from the callback set up in MPIAll*Op::ComputeAsync)
+// only adds the op's record into the local op queue (to track the op's
+// progress), and sends a message to the coordinator indicating that this rank
+// is ready to begin. The MPI background thread will handle the MPI message.
+void EnqueueTensorCollective(CollectiveOpRecord record,
+ MPIRequest::RequestType rtype) {
+ const Tensor* input_tensor = record.in_t;
+ MPIRequest message;
+ message.set_request_rank(record.rank);
+ message.set_tensor_name(record.name);
+ message.set_tensor_type(record.dtype);
+ message.set_request_type(rtype);
+ input_tensor->shape().AsProto(message.mutable_tensor_shape());
+
+ mutex_lock guard(mpi_global.mu);
+ mpi_global.tensor_table.emplace(record.name, record);
+ mpi_global.message_queue.push(message);
+}
+
+} // namespace
+
+#if GOOGLE_CUDA
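+// Expose the dedicated CUDA stream (MPIGlobalState::stream) so that the ring
+// implementation can issue its copies and accumulation kernels on it.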
+cudaStream_t CudaStreamForMPI() { return mpi_global.stream; }
+#endif
+
+// Op to initialize MPI in the current process. The device this op runs on
+// (CPU or a particular GPU) must also be used for all future MPI ops.
+template <typename Device>
+class MPIInitOp : public OpKernel {
+ public:
+ explicit MPIInitOp(OpKernelConstruction* context) : OpKernel(context) {}
+
+ void Compute(OpKernelContext* context) override {
+ bool on_gpu = IsGPUDevice<Device>();
+ OP_REQUIRES_OK(context, InitializeMPIOnce(on_gpu));
+ }
+};
+
+REGISTER_KERNEL_BUILDER(Name("MPIInit").Device(DEVICE_CPU),
+ MPIInitOp<CPUDevice>);
+#if GOOGLE_CUDA
+REGISTER_KERNEL_BUILDER(Name("MPIInit").Device(DEVICE_GPU),
+ MPIInitOp<GPUDevice>);
+#endif
+
+REGISTER_OP("MPIInit").Doc(R"doc(
+Initialize MPI for the current process.
+
+If this is run on a GPU, then that GPU must be used for all future MPI
+operations. If it is run on CPU, then all future MPI operations must also
+run on CPU.
+)doc");
+
+// Op to get the current MPI Size.
+template <typename Device>
+class MPISizeOp : public OpKernel {
+ public:
+ explicit MPISizeOp(OpKernelConstruction* context) : OpKernel(context) {}
+
+ void Compute(OpKernelContext* context) override {
+ OP_REQUIRES_OK(context, IsMPIInitialized());
+
+ // Write integer to output tensor
+ Tensor* output;
+ OP_REQUIRES_OK(context,
+ context->allocate_output(0, TensorShape({}), &output));
+
+ auto flat = output->flat<int>();
+ flat(0) = mpi_global.size;
+ }
+};
+
+REGISTER_KERNEL_BUILDER(Name("MPISize").Device(DEVICE_CPU),
+ MPISizeOp<CPUDevice>);
+#if GOOGLE_CUDA
+REGISTER_KERNEL_BUILDER(Name("MPISize").Device(DEVICE_GPU).HostMemory("size"),
+ MPISizeOp<GPUDevice>);
+#endif
+
+REGISTER_OP("MPISize")
+ .Output("size: int32")
+ .SetShapeFn([](shape_inference::InferenceContext* c) {
+ c->set_output(0, c->Scalar());
+ return Status::OK();
+ })
+ .Doc(R"doc(
+Returns the number of running MPI processes.
+
+More precisely, returns the number of MPI processes in the group associated
+with the MPI_COMM_WORLD communicator.
+
+size: Size of the MPI group.
+)doc");
+
+// Op to get the current MPI Rank.
+template <typename Device>
+class MPIRankOp : public OpKernel {
+ public:
+ explicit MPIRankOp(OpKernelConstruction* context) : OpKernel(context) {}
+
+ void Compute(OpKernelContext* context) override {
+ OP_REQUIRES_OK(context, IsMPIInitialized());
+
+ // Write integer to output tensor
+ Tensor* output;
+ OP_REQUIRES_OK(context,
+ context->allocate_output(0, TensorShape({}), &output));
+
+ auto flat = output->flat<int>();
+ flat(0) = mpi_global.rank;
+ }
+};
+
+REGISTER_KERNEL_BUILDER(Name("MPIRank").Device(DEVICE_CPU),
+ MPIRankOp<CPUDevice>);
+#if GOOGLE_CUDA
+REGISTER_KERNEL_BUILDER(Name("MPIRank").Device(DEVICE_GPU).HostMemory("rank"),
+ MPIRankOp<GPUDevice>);
+#endif
+
+REGISTER_OP("MPIRank")
+ .Output("rank: int32")
+ .SetShapeFn([](shape_inference::InferenceContext* c) {
+ c->set_output(0, c->Scalar());
+ return Status::OK();
+ })
+ .Doc(R"doc(
+Returns the index of the current process in the MPI group.
+
+More precisely, returns the rank of the calling process in the MPI_COMM_WORLD
+communicator.
+
+rank: Rank of the calling process.
+)doc");
+
+// Op to get the current local MPI Rank.
+template <typename Device>
+class MPILocalRankOp : public OpKernel {
+ public:
+ explicit MPILocalRankOp(OpKernelConstruction* context) : OpKernel(context) {}
+
+ void Compute(OpKernelContext* context) override {
+ OP_REQUIRES_OK(context, IsMPIInitialized());
+
+ // Write integer to output tensor
+ Tensor* output;
+ OP_REQUIRES_OK(context,
+ context->allocate_output(0, TensorShape({}), &output));
+
+ auto flat = output->flat<int>();
+ flat(0) = mpi_global.local_rank;
+ }
+};
+
+REGISTER_KERNEL_BUILDER(Name("MPILocalRank").Device(DEVICE_CPU),
+ MPILocalRankOp<CPUDevice>);
+#if GOOGLE_CUDA
+REGISTER_KERNEL_BUILDER(
+ Name("MPILocalRank").Device(DEVICE_GPU).HostMemory("rank"),
+ MPILocalRankOp<GPUDevice>);
+#endif
+
+REGISTER_OP("MPILocalRank")
+ .Output("rank: int32")
+ .SetShapeFn([](shape_inference::InferenceContext* c) {
+ c->set_output(0, c->Scalar());
+ return Status::OK();
+ })
+ .Doc(R"doc(
+Returns the index of the current process in the node it is on.
+
+More precisely, returns the rank of the calling process in the communicator
+that only spans the MPI processes running on that node.
+
+rank: Rank of the calling process on the node it is on.
+)doc");
+
+template <typename Device>
+class MPIAllreduceOp : public AsyncOpKernel {
+ public:
+ explicit MPIAllreduceOp(OpKernelConstruction* context)
+ : AsyncOpKernel(context) {}
+
+ // Although this op is handled asynchronously, the ComputeAsync call is
+ // very inexpensive. It only sets up a CollectiveOpRecord and places it
+ // in the table for the background thread to handle. Thus, we do not need
+ // a TF pool thread to perform the op.
+ bool IsExpensive() override { return false; }
+
+ void ComputeAsync(OpKernelContext* context, DoneCallback done) override {
+ OP_REQUIRES_OK_ASYNC(context, IsMPIInitialized(), done);
+ const Tensor* input_tensor = &context->input(0);
+ Tensor* output_tensor;
+ OP_REQUIRES_OK_ASYNC(
+ context,
+ context->allocate_output(0, input_tensor->shape(), &output_tensor),
+ done);
+
+ // Record allocated on stack so op can fail without memory leak
+ CollectiveOpRecord record;
+ record.name = name();
+ record.context = context;
+ record.in_t = input_tensor;
+ record.out_t = output_tensor;
+ record.on_gpu = IsGPUDevice<Device>();
+ record.dtype = input_tensor->dtype();
+
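+ // Allocate a temporary buffer sized to one chunk of the input,
+ // ceil(NumElements / size) elements; the ring allreduce described in ring.h
+ // exchanges the tensor in mpi_size evenly sized chunks.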
+ const size_t temp_size =
+ (input_tensor->NumElements() + mpi_global.size - 1) / mpi_global.size;
+ TensorShape temp_shape;
+ temp_shape.AddDim(temp_size);
+ OP_REQUIRES_OK_ASYNC(context,
+ context->allocate_temp(input_tensor->dtype(),
+ temp_shape, &record.temp_t),
+ done);
+
+ auto allreduce_done_callback = [done, context](StatusOr<Tensor> status) {
+ context->SetStatus(status.status());
+ done();
+ };
+ record.callback = allreduce_done_callback;
+
+ auto allreduce_launch_callback = [record] {
+ EnqueueTensorCollective(record, MPIRequest::ALLREDUCE);
+ };
+
+ // If we are on a CPU, our device context will be null and we can't
+ // get a stream to enqueue this on. On a CPU this op is called when the
+ // data is already available, so we can just immediately do the
+ // allreduce; we don't have to wait for the data to get populated.
+#if GOOGLE_CUDA
+ auto device_context = context->op_device_context();
+ if (device_context == nullptr) {
+ allreduce_launch_callback();
+ } else {
+ auto stream = device_context->stream();
+ stream->ThenDoHostCallback(allreduce_launch_callback);
+ }
+#else
+ allreduce_launch_callback();
+#endif
+ }
+};
+
+REGISTER_KERNEL_BUILDER(Name("MPIAllreduce").Device(DEVICE_CPU),
+ MPIAllreduceOp<CPUDevice>);
+#if GOOGLE_CUDA
+REGISTER_KERNEL_BUILDER(Name("MPIAllreduce").Device(DEVICE_GPU),
+ MPIAllreduceOp<GPUDevice>);
+#endif
+
+REGISTER_OP("MPIAllreduce")
+ .Attr("T: {int32, int64, float32}")
+ .Input("tensor: T")
+ .Output("sum: T")
+ .SetShapeFn([](shape_inference::InferenceContext* c) {
+ c->set_output(0, c->input(0));
+ return Status::OK();
+ })
+ .Doc(R"doc(
+Perform an MPI Allreduce on a tensor. All processes that reduce a tensor with
+a given name must use the same shape for that tensor; tensors are matched
+across processes by node name.
+
+Arguments
+ tensor: A tensor to reduce.
+
+Output
+ sum: A tensor with the same shape as `tensor`, summed across all
+ MPI processes.
+)doc");
+
+template <typename Device>
+class MPIAllgatherOp : public AsyncOpKernel {
+ public:
+ explicit MPIAllgatherOp(OpKernelConstruction* context)
+ : AsyncOpKernel(context) {}
+
+ // Although this op is handled asynchronously, the ComputeAsync call is
+ // very inexpensive. It only sets up a CollectiveOpRecord and places it
+ // in the table for the background thread to handle. Thus, we do not need
+ // a TF pool thread to perform the op.
+ bool IsExpensive() override { return false; }
+
+ void ComputeAsync(OpKernelContext* context, DoneCallback done) override {
+ OP_REQUIRES_OK_ASYNC(context, IsMPIInitialized(), done);
+ const Tensor* input_tensor = &context->input(0);
+ const Tensor* sizing_tensor = &context->input(1);
+
+ // Record allocated on stack so op can fail without memory leak
+ CollectiveOpRecord record;
+ record.name = name();
+ record.context = context;
+ record.in_t = input_tensor;
+ record.on_gpu = IsGPUDevice<Device>();
+
+ // Construct the output size from the sizing tensor
+ size_t output_first_dim = 0;
+ if (sizing_tensor->shape().dims() == 0) {
+ // 0-dim sizing_tensor implies that the op is just gathering
+ // a single element from each rank
+ output_first_dim = mpi_global.size;
+ for (int i = 0; i < mpi_global.size; i++) {
+ record.sizes_vec.push_back(1);
+ }
+ } else {
+ // Collect the total output tensor sizing from the sizing tensor
+ // NOTE: The sizing tensor is forced to be placed on the CPU by
+ // declaring the input as HostMemory, so it is valid to read it here.
+ const int64* sizing_array =
+ (const int64*)sizing_tensor->tensor_data().data();
+ for (int i = 0; i < mpi_global.size; i++) {
+ record.sizes_vec.push_back(sizing_array[i]);
+ output_first_dim += sizing_array[i];
+ }
+ }
+
+ TensorShape output_shape;
+ output_shape.AddDim(output_first_dim);
+ for (int i = 1; i < input_tensor->shape().dims(); i++) {
+ output_shape.AddDim(input_tensor->shape().dim_size(i));
+ }
+
+ Tensor* output_tensor;
+ OP_REQUIRES_OK_ASYNC(
+ context, context->allocate_output(0, output_shape, &output_tensor),
+ done);
+
+ record.out_t = output_tensor;
+ record.dtype = input_tensor->dtype();
+
+ auto allgather_done_callback = [done, context](StatusOr<Tensor> status) {
+ context->SetStatus(status.status());
+ done();
+ };
+ record.callback = allgather_done_callback;
+
+ auto allgather_launch_callback = [record] {
+ EnqueueTensorCollective(record, MPIRequest::ALLGATHER);
+ };
+
+ // If we are on a CPU, our device context will be null and we can't
+ // get a stream to enqueue this on. On a CPU this op is called when the
+ // data is already available, so we can just immediately do the
+ // allgather; we don't have to wait for the data to get populated.
+#if GOOGLE_CUDA
+ auto device_context = context->op_device_context();
+ if (device_context == nullptr) {
+ allgather_launch_callback();
+ } else {
+ auto stream = device_context->stream();
+ stream->ThenDoHostCallback(allgather_launch_callback);
+ }
+#else
+ allgather_launch_callback();
+#endif
+ }
+};
+
+REGISTER_OP("MPIAllgather")
+ .Attr("T: {int32, int64, float32}")
+ .Attr("S: {int64}")
+ .Input("tensor: T")
+ .Input("sizes: S")
+ .Output("gathered: T")
+ .SetShapeFn([](shape_inference::InferenceContext* c) {
+ shape_inference::ShapeHandle output;
+ TF_RETURN_IF_ERROR(
+ c->ReplaceDim(c->input(0), 0, c->UnknownDim(), &output));
+ c->set_output(0, output);
+ return Status::OK();
+ })
+ .Doc(R"doc(
+Perform an MPI Allgather on a tensor. All processes that gather a tensor with
+a given name must use tensors of the same rank, with identical sizes in every
+dimension except the first.
+
+Arguments
+ tensor: A tensor to gather.
+ sizes: A tensor containing the first-dimension sizes of tensors to be
+ gathered from other ranks.
+
+Output
+ gathered: A tensor with the same shape as `tensor` except for the first
+ dimension, which is the sum of the values in `sizes`.
+)doc");
+
+REGISTER_KERNEL_BUILDER(
+ Name("MPIAllgather").Device(DEVICE_CPU).HostMemory("sizes"),
+ MPIAllgatherOp<CPUDevice>);
+#if GOOGLE_CUDA
+REGISTER_KERNEL_BUILDER(
+ Name("MPIAllgather").Device(DEVICE_GPU).HostMemory("sizes"),
+ MPIAllgatherOp<GPUDevice>);
+#endif
+
+} // namespace mpi
+} // namespace contrib
+} // namespace tensorflow
+
+#endif // TENSORFLOW_USE_MPI
diff --git a/tensorflow/contrib/mpi_collectives/mpi_ops.py b/tensorflow/contrib/mpi_collectives/mpi_ops.py
new file mode 100644
index 0000000000..81567cc688
--- /dev/null
+++ b/tensorflow/contrib/mpi_collectives/mpi_ops.py
@@ -0,0 +1,165 @@
+# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# =============================================================================
+"""Inter-process communication using MPI."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import tensorflow as tf
+
+from tensorflow.python.framework import errors
+from tensorflow.python.framework import load_library
+from tensorflow.python.framework import ops
+from tensorflow.python.platform import resource_loader
+from tensorflow.python.platform import tf_logging as logging
+
+
+def _load_library(name, op_list=None):
+ """Loads a .so file containing the specified operators.
+
+ Args:
+ name: The name of the .so file to load.
+ op_list: A list of names of operators that the library should have. If None
+ then the .so file's contents will not be verified.
+
+ Raises:
+ NameError if one of the required ops is missing.
+ """
+ try:
+ filename = resource_loader.get_path_to_datafile(name)
+ library = load_library.load_op_library(filename)
+ for expected_op in (op_list or []):
+ for lib_op in library.OP_LIST.op:
+ if lib_op.name == expected_op:
+ break
+ else:
+ raise NameError(
+ 'Could not find operator %s in dynamic library %s' %
+ (expected_op, name))
+ return library
+ except errors.NotFoundError:
+ logging.warning('%s file could not be loaded.', name)
+
+
+MPI_LIB = _load_library('mpi_collectives.so', ['MPISize', 'MPIRank',
+ 'MPILocalRank', 'MPIAllgather',
+ 'MPIAllreduce'])
+
+
+def size(name=None):
+ """An op which returns the number of MPI processes.
+
+ This is equivalent to running `MPI_Comm_size(MPI_COMM_WORLD, ...)` to get the
+ size of the global communicator.
+
+ Returns:
+ An integer scalar containing the number of MPI processes.
+ """
+ return MPI_LIB.mpi_size(name=name)
+
+
+ops.NotDifferentiable('MPISize')
+
+
+def rank(name=None):
+ """An op which returns the MPI rank of the calling process.
+
+ This is equivalent to running `MPI_Comm_rank(MPI_COMM_WORLD, ...)` to get the
+ rank of the current process in the global communicator.
+
+ Returns:
+ An integer scalar with the MPI rank of the calling process.
+ """
+ return MPI_LIB.mpi_rank(name=name)
+
+
+ops.NotDifferentiable('MPIRank')
+
+
+def init(name=None):
+ """An op which initializes MPI on the device on which it is run.
+
+ All future MPI ops must be run on the same device that the `init` op was run
+ on.
+ """
+ return MPI_LIB.mpi_init(name=name)
+
+
+ops.NotDifferentiable('MPIInit')
+
+
+def local_rank(name=None):
+ """An op which returns the local MPI rank of the calling process, within the
+ node that it is running on. For example, if there are seven processes running
+ on a node, their local ranks will be zero through six, inclusive.
+
+ This is equivalent to running `MPI_Comm_rank(...)` on a new communicator
+ which only includes processes on the same node.
+
+ Returns:
+ An integer scalar with the local MPI rank of the calling process.
+ """
+ return MPI_LIB.mpi_local_rank(name=name)
+
+
+ops.NotDifferentiable('MPILocalRank')
+
+
+def _allreduce(tensor, name=None):
+ """An op which sums an input tensor over all the MPI processes.
+
+ The reduction operation is keyed by the name of the op. The tensor type and
+ shape must be the same on all MPI processes for a given name. The reduction
+ will not start until all processes are ready to send and receive the tensor.
+
+ Returns:
+ A tensor of the same shape and type as `tensor`, summed across all
+ processes.
+ """
+ return MPI_LIB.mpi_allreduce(tensor, name=name)
+
+
+ops.NotDifferentiable('MPIAllreduce')
+
+
+def allgather(tensor, name=None):
+ """An op which concatenates the input tensor with the same input tensor on
+ all other MPI processes.
+
+ The concatenation is done on the first dimension, so the input tensors on the
+ different processes must have the same rank and shape, except for the first
+ dimension, which is allowed to be different.
+
+ Returns:
+ A tensor of the same type as `tensor`, concatenated on dimension zero
+ across all processes. The shape is identical to the input shape, except for
+ the first dimension, which may be greater and is the sum of all first
+ dimensions of the tensors in different MPI processes.
+ """
+ # Specify that first allgather is to collect the tensor gather sizes,
+ # indicated by passing in a scalar (0-D tensor) of value 0
+ sizes_flag = tf.constant(0, dtype=tf.int64, name="size_flag_const")
+ my_size = tf.slice(tf.shape(tensor, out_type=tf.int64), [0], [1], name="size_slice")
+ if name is None:
+ name = "allgather"
+ sizing_name = "{}_sizing".format(name)
+ sizes = MPI_LIB.mpi_allgather(my_size, sizes_flag, name=sizing_name)
+ return MPI_LIB.mpi_allgather(tensor, sizes, name=name)
+
+
+ops.NotDifferentiable('MPIAllgather')
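+# A minimal usage sketch (illustrative only; assumes MPI has been initialized
+# via the `init` op above and that the process was launched with `mpirun`):
+#
+#   gathered = allgather(tf.constant([1.0, 2.0]))
+#   # After evaluation, every rank holds the inputs from all ranks
+#   # concatenated along dimension zero.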
+
+
diff --git a/tensorflow/contrib/mpi_collectives/mpi_ops_test.py b/tensorflow/contrib/mpi_collectives/mpi_ops_test.py
new file mode 100644
index 0000000000..48e5c0a0c7
--- /dev/null
+++ b/tensorflow/contrib/mpi_collectives/mpi_ops_test.py
@@ -0,0 +1,296 @@
+# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# =============================================================================
+
+"""Tests for tensorflow.contrib.mpi_collectives.mpi_ops."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import os.path
+import itertools
+
+import tensorflow as tf
+
+import tensorflow.contrib.mpi_collectives as mpi
+
+
+def mpi_env_rank_and_size():
+ """Get MPI rank and size from environment variables and return them as a
+ tuple of integers.
+
+ Most MPI implementations have an `mpirun` or `mpiexec` command that will
+ run an MPI executable and set up all communication necessary between the
+ different processors. As part of that set up, they will set environment
+ variables that contain the rank and size of the MPI_COMM_WORLD
+ communicator. We can read those environment variables from Python in order
+ to ensure that `mpi.rank()` and `mpi.size()` return the expected values.
+
+ Since MPI is just a standard, not an implementation, implementations
+ typically choose their own environment variable names. This function tries
+ to support several different implementations, but really it only needs to
+ support whatever implementation we want to use for the TensorFlow test
+ suite.
+
+ If this is not running under MPI, then defaults of rank zero and size one
+ are returned. (This is appropriate because when you call MPI_Init in an
+ application not started with mpirun, it will create a new independent
+ communicator with only one process in it.)
+ """
+ rank_env = "PMI_RANK OMPI_COMM_WORLD_RANK".split()
+ size_env = "PMI_SIZE OMPI_COMM_WORLD_SIZE".split()
+
+ for rank_var, size_var in zip(rank_env, size_env):
+ rank = os.environ.get(rank_var)
+ size = os.environ.get(size_var)
+ if rank is not None and size is not None:
+ return int(rank), int(size)
+
+ # Default to rank zero and size one if there are no environment variables
+ return 0, 1
+
+
+class MPITests(tf.test.TestCase):
+ """
+ Tests for MPI ops in tensorflow.contrib.mpi_collectives.
+ """
+
+ def test_mpi_rank(self):
+ """Test that the rank returned by mpi.rank() is correct."""
+ true_rank, _ = mpi_env_rank_and_size()
+ with self.test_session() as session:
+ rank = session.run(mpi.rank())
+ self.assertEqual(true_rank, rank)
+
+ def test_mpi_size(self):
+ """Test that the size returned by mpi.size() is correct."""
+ _, true_size = mpi_env_rank_and_size()
+ with self.test_session() as session:
+ size = session.run(mpi.size())
+ self.assertEqual(true_size, size)
+
+ def test_mpi_allreduce_cpu(self):
+ """Test on CPU that the allreduce correctly sums 1D, 2D, 3D tensors."""
+ with self.test_session() as session:
+ size = session.run(mpi.size())
+
+ dtypes = [tf.int32, tf.float32]
+ dims = [1, 2, 3]
+ for dtype, dim in itertools.product(dtypes, dims):
+ tf.set_random_seed(1234)
+ tensor = tf.random_uniform([17] * dim, -100, 100, dtype=dtype)
+ summed = mpi.allreduce(tensor, average=False)
+ multiplied = tensor * size
+ max_difference = tf.reduce_max(tf.abs(summed - multiplied))
+
+ # Threshold for floating point equality depends on number of
+ # ranks, since we're comparing against precise multiplication.
+ if size <= 3:
+ threshold = 0
+ elif size < 10:
+ threshold = 1e-4
+ elif size < 15:
+ threshold = 5e-4
+ else:
+ break
+
+ diff = session.run(max_difference)
+ self.assertTrue(diff <= threshold,
+ "mpi.allreduce produces incorrect results")
+
+ def test_mpi_allreduce_gpu(self):
+ """Test that the allreduce works on GPUs.
+
+ This test will crash badly if used with an MPI implementation that does
+ not support GPU memory transfers directly, as it will call MPI_Send on
+ a GPU data pointer."""
+ # Only do this test if there are GPUs available.
+ if not tf.test.is_gpu_available(cuda_only=True):
+ return
+
+ no_gpus = tf.GPUOptions(visible_device_list="")
+ cpu_config = tf.ConfigProto(gpu_options=no_gpus)
+ with self.test_session(config=cpu_config) as session:
+ local_rank = session.run(mpi.local_rank())
+
+ one_gpu = tf.GPUOptions(visible_device_list=str(local_rank))
+ gpu_config = tf.ConfigProto(gpu_options=one_gpu)
+ with self.test_session(config=gpu_config) as session:
+ size = session.run(mpi.size())
+
+ dtype = tf.float32
+ dim = 3
+ with tf.device("/gpu:0"):
+ tf.set_random_seed(1234)
+ tensor = tf.random_uniform([17] * dim, -100, 100, dtype=dtype)
+ summed = mpi.allreduce(tensor, average=False)
+ multiplied = tensor * size
+ max_difference = tf.reduce_max(tf.abs(summed - multiplied))
+
+ # Threshold for floating point equality depends on number of
+ # ranks, since we're comparing against precise multiplication.
+ if size <= 3:
+ threshold = 0
+ elif size < 10:
+ threshold = 1e-4
+ elif size < 15:
+ threshold = 5e-4
+ else:
+ return
+
+ diff = session.run(max_difference)
+ self.assertTrue(diff <= threshold,
+ "mpi.allreduce on GPU produces incorrect results")
+
+ def test_mpi_allreduce_error(self):
+ """Test that the allreduce raises an error if different ranks try to
+ send tensors of different rank or dimension."""
+ with self.test_session() as session:
+ rank = session.run(mpi.rank())
+ size = session.run(mpi.size())
+
+ # This test does not apply if there is only one worker.
+ if size == 1:
+ return
+
+ # Same rank, different dimension
+ tf.set_random_seed(1234)
+ dims = [17 + rank] * 3
+ tensor = tf.random_uniform(dims, -1.0, 1.0)
+ with self.assertRaises(tf.errors.FailedPreconditionError):
+ session.run(mpi.allreduce(tensor))
+
+ # Same number of elements, different rank
+ tf.set_random_seed(1234)
+ if rank == 0:
+ dims = [17, 23 * 57]
+ else:
+ dims = [17, 23, 57]
+ tensor = tf.random_uniform(dims, -1.0, 1.0)
+ with self.assertRaises(tf.errors.FailedPreconditionError):
+ session.run(mpi.allreduce(tensor))
+
+ def test_mpi_allreduce_type_error(self):
+ """Test that the allreduce raises an error if different ranks try to
+ send tensors of different type."""
+ with self.test_session() as session:
+ rank = session.run(mpi.rank())
+ size = session.run(mpi.size())
+
+ # This test does not apply if there is only one worker.
+ if size == 1:
+ return
+
+ # Same shape, different data type
+ dims = [17] * 3
+ tensor = tf.ones(dims, dtype=tf.int32 if rank % 2 == 0 else tf.float32)
+ with self.assertRaises(tf.errors.FailedPreconditionError):
+ session.run(mpi.allreduce(tensor))
+
+ def test_mpi_allgather(self):
+ """Test that the allgather correctly gathers 1D, 2D, 3D tensors."""
+ with self.test_session() as session:
+ size = session.run(mpi.size())
+ rank = session.run(mpi.rank())
+
+ dtypes = tf.int32, tf.float32
+ dims = 1, 2, 3
+ for dtype, dim in itertools.product(dtypes, dims):
+ tensor = tf.ones([17] * dim, dtype=dtype) * rank
+ gathered = mpi.allgather(tensor)
+
+ gathered_tensor = session.run(gathered)
+ self.assertEqual(list(gathered_tensor.shape),
+ [17 * size] + [17] * (dim - 1))
+
+ for i in range(size):
+ rank_tensor = tf.slice(gathered_tensor, [i * 17] + [0] * (dim - 1),
+ [17] + [-1] * (dim - 1))
+ self.assertEqual(list(rank_tensor.shape), [17] * dim)
+ self.assertTrue(session.run(tf.reduce_all(tf.equal(rank_tensor, i))),
+ "mpi.allgather produces incorrect gathered tensor")
+
+ def test_mpi_allgather_variable_size(self):
+ """Test that the allgather correctly gathers 1D, 2D, 3D tensors,
+ even if those tensors have different sizes along the first dim."""
+ with self.test_session() as session:
+ size = session.run(mpi.size())
+ rank = session.run(mpi.rank())
+
+ dtypes = tf.int32, tf.float32
+ dims = 1, 2, 3
+ for dtype, dim in itertools.product(dtypes, dims):
+ # Support tests up to MPI Size of 35
+ if size > 35:
+ break
+
+ tensor_sizes = [17, 32, 81, 12, 15, 23, 22] * 5
+ tensor_sizes = tensor_sizes[:size]
+
+ tensor = tf.ones([tensor_sizes[rank]] + [17] * (dim - 1),
+ dtype=dtype) * rank
+ gathered = mpi.allgather(tensor)
+
+ gathered_tensor = session.run(gathered)
+ expected_size = sum(tensor_sizes)
+ self.assertEqual(list(gathered_tensor.shape),
+ [expected_size] + [17] * (dim - 1))
+
+ for i in range(size):
+ rank_size = [tensor_sizes[i]] + [17] * (dim - 1)
+ rank_tensor = tf.slice(gathered,
+ [sum(tensor_sizes[:i])] + [0] * (dim - 1),
+ rank_size)
+ self.assertEqual(list(rank_tensor.shape), rank_size)
+ self.assertTrue(session.run(tf.reduce_all(tf.equal(rank_tensor, i))),
+ "mpi.allgather produces incorrect gathered tensor")
+
+ def test_mpi_allgather_error(self):
+ """Test that the allgather returns an error if any dimension besides
+ the first is different among the tensors being gathered."""
+ with self.test_session() as session:
+ rank = session.run(mpi.rank())
+ size = session.run(mpi.size())
+
+ # This test does not apply if there is only one worker.
+ if size == 1:
+ return
+
+ tensor_size = [17] * 3
+ tensor_size[1] = 10 * (rank + 1)
+ tensor = tf.ones(tensor_size, dtype=tf.float32) * rank
+ with self.assertRaises(tf.errors.FailedPreconditionError):
+ session.run(mpi.allgather(tensor))
+
+ def test_mpi_allgather_type_error(self):
+ """Test that the allgather returns an error if the types being gathered
+ differ among the processes."""
+ with self.test_session() as session:
+ rank = session.run(mpi.rank())
+ size = session.run(mpi.size())
+
+ # This test does not apply if there is only one worker.
+ if size == 1:
+ return
+
+ tensor_size = [17] * 3
+ dtype = tf.int32 if rank % 2 == 0 else tf.float32
+ tensor = tf.ones(tensor_size, dtype=dtype) * rank
+ with self.assertRaises(tf.errors.FailedPreconditionError):
+ session.run(mpi.allgather(tensor))
+
+
+if __name__ == '__main__':
+ tf.test.main()
diff --git a/tensorflow/contrib/mpi_collectives/ring.cc b/tensorflow/contrib/mpi_collectives/ring.cc
new file mode 100644
index 0000000000..d93233eb21
--- /dev/null
+++ b/tensorflow/contrib/mpi_collectives/ring.cc
@@ -0,0 +1,80 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifdef TENSORFLOW_USE_MPI
+
+#define EIGEN_USE_THREADS
+
+#include "tensorflow/contrib/mpi_collectives/ring.h"
+
+namespace tensorflow {
+namespace contrib {
+namespace mpi {
+
+using CPUDevice = Eigen::ThreadPoolDevice;
+
+extern template MPI_Datatype MPIType<float>();
+extern template MPI_Datatype MPIType<int>();
+extern template MPI_Datatype MPIType<long long>();
+extern template DataType TensorFlowDataType<float>();
+extern template DataType TensorFlowDataType<int>();
+extern template DataType TensorFlowDataType<long long>();
+
+// Generate all necessary specializations for RingAllreduce.
+template Status RingAllreduce<CPUDevice, int>(OpKernelContext*, const Tensor*,
+ Tensor*, Tensor*);
+template Status RingAllreduce<CPUDevice, long long>(OpKernelContext*,
+ const Tensor*, Tensor*,
+ Tensor*);
+template Status RingAllreduce<CPUDevice, float>(OpKernelContext*, const Tensor*,
+ Tensor*, Tensor*);
+
+// Generate all necessary specializations for RingAllgather.
+template Status RingAllgather<CPUDevice, int>(OpKernelContext*, const Tensor*,
+ const std::vector<size_t>&,
+ Tensor*);
+template Status RingAllgather<CPUDevice, long long>(OpKernelContext*,
+ const Tensor*,
+ const std::vector<size_t>&,
+ Tensor*);
+template Status RingAllgather<CPUDevice, float>(OpKernelContext*, const Tensor*,
+ const std::vector<size_t>&,
+ Tensor*);
+
+// Copy data on a CPU using a straightforward memcpy.
+template <>
+void CopyTensorData<CPUDevice>(void* dst, void* src, size_t size) {
+ std::memcpy(dst, src, size);
+};
+
+// Accumulate values on a CPU.
+#define GENERATE_ACCUMULATE(type) \
+ template <> \
+ void AccumulateTensorData<CPUDevice, type>(type * dst, type * src, \
+ size_t size) { \
+ for (unsigned int i = 0; i < size; i++) { \
+ dst[i] += src[i]; \
+ } \
+ };
+GENERATE_ACCUMULATE(int);
+GENERATE_ACCUMULATE(long long);
+GENERATE_ACCUMULATE(float);
+#undef GENERATE_ACCUMULATE
+
+} // namespace mpi
+} // namespace contrib
+} // namespace tensorflow
+
+#endif // TENSORFLOW_USE_MPI
diff --git a/tensorflow/contrib/mpi_collectives/ring.cu.cc b/tensorflow/contrib/mpi_collectives/ring.cu.cc
new file mode 100644
index 0000000000..2f3eef366a
--- /dev/null
+++ b/tensorflow/contrib/mpi_collectives/ring.cu.cc
@@ -0,0 +1,117 @@
+/* Copyright 2016 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifdef TENSORFLOW_USE_MPI
+
+#if GOOGLE_CUDA
+
+#define EIGEN_USE_GPU
+
+#include "tensorflow/contrib/mpi_collectives/ring.h"
+
+namespace tensorflow {
+namespace contrib {
+namespace mpi {
+
+using CPUDevice = Eigen::ThreadPoolDevice;
+
+template <>
+MPI_Datatype MPIType<float>() {
+ return MPI_FLOAT;
+};
+template <>
+MPI_Datatype MPIType<int>() {
+ return MPI_INT;
+};
+template <>
+MPI_Datatype MPIType<long long>() {
+ return MPI_LONG_LONG;
+};
+
+template <>
+DataType TensorFlowDataType<float>() {
+ return DT_FLOAT;
+};
+template <>
+DataType TensorFlowDataType<int>() {
+ return DT_INT32;
+};
+template <>
+DataType TensorFlowDataType<long long>() {
+ return DT_INT64;
+};
+
+// Generate all necessary specializations for RingAllreduce.
+template Status RingAllreduce<GPUDevice, int>(OpKernelContext*, const Tensor*,
+ Tensor*, Tensor*);
+template Status RingAllreduce<GPUDevice, long long>(OpKernelContext*,
+ const Tensor*, Tensor*,
+ Tensor*);
+template Status RingAllreduce<GPUDevice, float>(OpKernelContext*, const Tensor*,
+ Tensor*, Tensor*);
+
+// Generate all necessary specializations for RingAllgather.
+template Status RingAllgather<GPUDevice, int>(OpKernelContext*, const Tensor*,
+ const std::vector<size_t>&,
+ Tensor*);
+template Status RingAllgather<GPUDevice, long long>(OpKernelContext*,
+ const Tensor*,
+ const std::vector<size_t>&,
+ Tensor*);
+template Status RingAllgather<GPUDevice, float>(OpKernelContext*, const Tensor*,
+ const std::vector<size_t>&,
+ Tensor*);
+
+// Synchronously copy data on the GPU, using a stream separate from both the
+// default stream and TensorFlow's streams, to avoid synchronizing on
+// operations unrelated to the allreduce.
+template <>
+void CopyTensorData<GPUDevice>(void* dst, void* src, size_t size) {
+ auto stream = CudaStreamForMPI();
+ cudaMemcpyAsync(dst, src, size, cudaMemcpyDeviceToDevice, stream);
+ cudaStreamSynchronize(stream);
+};
+
+// Elementwise accumulation kernel for GPU.
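+// The grid-stride loop lets the fixed <<<32, 256>>> launch below cover
+// tensors of any length.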
+template <typename T>
+__global__ void elemwise_accum(T* out, const T* in, const size_t N) {
+ for (size_t i = blockIdx.x * blockDim.x + threadIdx.x; i < N;
+ i += blockDim.x * gridDim.x) {
+ out[i] += in[i];
+ }
+}
+
+// Synchronously accumulate tensors on the GPU, using a stream separate from
+// both the default stream and TensorFlow's streams, to avoid synchronizing on
+// operations unrelated to the allreduce.
+#define GENERATE_ACCUMULATE(type) \
+ template <> \
+ void AccumulateTensorData<GPUDevice, type>(type * dst, type * src, \
+ size_t size) { \
+ auto stream = CudaStreamForMPI(); \
+ elemwise_accum<type><<<32, 256, 0, stream>>>(dst, src, size); \
+ cudaStreamSynchronize(stream); \
+ };
+GENERATE_ACCUMULATE(int);
+GENERATE_ACCUMULATE(long long);
+GENERATE_ACCUMULATE(float);
+#undef GENERATE_ACCUMULATE
+
+} // namespace mpi
+} // namespace contrib
+} // namespace tensorflow
+#endif // GOOGLE_CUDA
+
+#endif // TENSORFLOW_USE_MPI
diff --git a/tensorflow/contrib/mpi_collectives/ring.h b/tensorflow/contrib/mpi_collectives/ring.h
new file mode 100644
index 0000000000..cae57ce60e
--- /dev/null
+++ b/tensorflow/contrib/mpi_collectives/ring.h
@@ -0,0 +1,327 @@
+/* Copyright 2016 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifndef TENSORFLOW_CONTRIB_MPI_H_
+#define TENSORFLOW_CONTRIB_MPI_H_
+
+#ifdef TENSORFLOW_USE_MPI
+
+#include "tensorflow/core/framework/op.h"
+#include "tensorflow/core/framework/op_kernel.h"
+#include "tensorflow/core/framework/shape_inference.h"
+
+#include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
+#include "tensorflow/core/framework/tensor_types.h"
+
+#if GOOGLE_CUDA
+#include "cuda_runtime.h"
+#endif
+
+// Needed to avoid header issues with C++-supporting MPI implementations
+#define OMPI_SKIP_MPICXX
+#include "third_party/mpi/mpi.h"
+
+#define TAG_TENSOR 12
+
+namespace tensorflow {
+namespace contrib {
+namespace mpi {
+
+using CPUDevice = Eigen::ThreadPoolDevice;
+using GPUDevice = Eigen::GpuDevice;
+
+// Convert from templated types to values we can pass to MPI.
+template <typename T>
+MPI_Datatype MPIType();
+
+// Convert from templated types to TensorFlow data types.
+template <typename T>
+DataType TensorFlowDataType();
+
+#define MPI_REQUIRES_OK(MPI_STATUS) \
+ if ((MPI_STATUS) != MPI_SUCCESS) { \
+ return errors::Unknown("MPI operation failed unexpectedly."); \
+ }
+
+// Copy data from one tensor to another tensor.
+// This uses a custom CUDA stream on GPU, which is necessary to overlap the
+// backpropagation computations with the allreduce.
+template <typename Device>
+void CopyTensorData(void* destination, void* source, size_t size);
+
+// Add a tensor into another tensor, accumulating in place.
+// This uses a custom CUDA stream on GPU, which is necessary to overlap the
+// backpropagation computations with the allreduce.
+template <typename Device, typename T>
+void AccumulateTensorData(T* destination, T* source, size_t size);
+
+// We need to get the right stream for doing CUDA memory transfers and
+// operations, which is possibly different from the standard TensorFlow stream.
+#if GOOGLE_CUDA
+cudaStream_t CudaStreamForMPI();
+#endif
+
+/* Perform a ring allreduce on the data. Allocate the necessary output tensor
+ * and store it in the output parameter.
+ *
+ * Assumes that all MPI processes are doing an allreduce of the same tensor,
+ * with the same dimensions.
+ *
+ * A ring allreduce is a bandwidth-optimal way to do an allreduce. To do the
+ * allreduce, the nodes involved are arranged in a ring:
+ *
+ * .--0--.
+ * / \
+ * 3 1
+ * \ /
+ * *--2--*
+ *
+ * Each node always sends to the next clockwise node in the ring, and receives
+ * from the previous one.
+ *
+ * The allreduce is done in two parts: a scatter-reduce and an allgather. In
+ * the scatter reduce, a reduction is done, so that each node ends up with a
+ * chunk of the final output tensor which has contributions from all other
+ * nodes. In the allgather, those chunks are distributed among all the nodes,
+ * so that all nodes have the entire output tensor.
+ *
+ * Both of these operations are done by dividing the input tensor into N
+ * evenly sized chunks (where N is the number of nodes in the ring).
+ *
+ * The scatter-reduce is done in N-1 steps. In the ith step, node j will send
+ * the (j - i)th chunk and receive the (j - i - 1)th chunk, adding it into
+ * its existing data for that chunk. For example, in the first iteration with
+ * the ring depicted above, you will have the following transfers:
+ *
+ * Segment 0: Node 0 --> Node 1
+ * Segment 1: Node 1 --> Node 2
+ * Segment 2: Node 2 --> Node 3
+ * Segment 3: Node 3 --> Node 0
+ *
+ * In the second iteration, you'll have the following transfers:
+ *
+ * Segment 0: Node 1 --> Node 2
+ * Segment 1: Node 2 --> Node 3
+ * Segment 2: Node 3 --> Node 0
+ * Segment 3: Node 0 --> Node 1
+ *
+ * After this iteration, Node 2 has three of the four contributions to Segment 0.
+ * The last iteration has the following transfers:
+ *
+ * Segment 0: Node 2 --> Node 3
+ * Segment 1: Node 3 --> Node 0
+ * Segment 2: Node 0 --> Node 1
+ * Segment 3: Node 1 --> Node 2
+ *
+ * After this iteration, Node 3 has the fully accumulated Segment 0; Node 0
+ * has the fully accumulated Segment 1; and so on. The scatter-reduce is
+ * complete.
+ *
+ * Next, the allgather distributes these fully accumulated chunks across all
+ * nodes. Communication proceeds in the same ring, once again in N-1 steps. At
+ * the ith step, node j will send chunk (j - i + 1) and receive chunk (j - i).
+ * For example, at the first iteration, the following transfers will occur:
+ *
+ * Segment 0: Node 3 --> Node 0
+ * Segment 1: Node 0 --> Node 1
+ * Segment 2: Node 1 --> Node 2
+ * Segment 3: Node 2 --> Node 3
+ *
+ * After the first iteration, Node 0 will have a fully accumulated Segment 0
+ * (from Node 3) and Segment 1. In the next iteration, Node 0 will send its
+ * just-received Segment 0 onward to Node 1, and receive Segment 3 from Node 3.
+ * After this has continued for N - 1 iterations, all nodes will have the
+ * fully accumulated tensor.
+ *
+ * Each node will do (N-1) sends for the scatter-reduce and (N-1) sends for the
+ * allgather. Each send will contain K / N bytes, if there are K bytes in the
+ * original tensor on every node. Thus, each node sends and receives 2K(N - 1)/N
+ * bytes of data, and the performance of the allreduce (assuming no latency in
+ * connections) is constrained by the slowest interconnect between the nodes.
+ *
+ */
+template <typename Device, typename T>
+Status RingAllreduce(OpKernelContext* context, const Tensor* input,
+ Tensor* temp, Tensor* output) {
+ // Acquire MPI size and rank
+ int n, r;
+ MPI_REQUIRES_OK(MPI_Comm_size(MPI_COMM_WORLD, &n));
+ MPI_REQUIRES_OK(MPI_Comm_rank(MPI_COMM_WORLD, &r));
+
+ T* buffer = (T*)output->tensor_data().data();
+
+ CopyTensorData<Device>((void*)buffer, (void*)input->tensor_data().data(),
+ output->tensor_data().size());
+
+ // Calculate segment sizes and segment ends
+ const size_t elements_to_reduce = input->NumElements();
+ const size_t segment_size = elements_to_reduce / n;
+ std::vector<size_t> segment_sizes(n, segment_size);
+
+ const size_t residual = elements_to_reduce % n;
+ for (size_t i = 0; i < residual; ++i) {
+ segment_sizes[i]++;
+ }
+
+ std::vector<size_t> segment_starts(n);
+ segment_starts[0] = 0;
+ for (size_t i = 1; i < segment_starts.size(); ++i) {
+ segment_starts[i] = segment_starts[i - 1] + segment_sizes[i - 1];
+ }
+
+ assert(segment_starts[n - 1] + segment_sizes[n - 1] == elements_to_reduce);
+
+ T* segment_recv = (T*)temp->tensor_data().data();
+
+ // Receive from your left neighbor with wrap-around
+ const size_t recv_from = ((r - 1) + n) % n;
+
+ // Send to your right neighbor with wrap-around
+ const size_t send_to = (r + 1) % n;
+
+ MPI_Status recv_status;
+ MPI_Request recv_req;
+
+ // Now start ring. At every step, for every rank, we iterate through
+ // segments with wraparound and send and recv from our neighbors and reduce
+  // locally. At the i'th iteration, rank r sends segment (r-i) and receives
+ // segment (r-i-1).
+ for (int i = 0; i < n - 1; i++) {
+ const size_t send_seg_id = ((r - i) + n) % n;
+ const size_t recv_seg_id = ((r - i - 1) + n) % n;
+
+ T* segment_send = &(buffer[segment_starts[send_seg_id]]);
+
+ MPI_REQUIRES_OK(MPI_Irecv(segment_recv, segment_sizes[recv_seg_id],
+ MPIType<T>(), recv_from, TAG_TENSOR,
+ MPI_COMM_WORLD, &recv_req));
+
+ MPI_REQUIRES_OK(MPI_Send(segment_send, segment_sizes[send_seg_id],
+ MPIType<T>(), send_to, TAG_TENSOR,
+ MPI_COMM_WORLD));
+
+ T* segment_update = &(buffer[segment_starts[recv_seg_id]]);
+
+ // Wait for recv to complete before reduction
+ MPI_REQUIRES_OK(MPI_Wait(&recv_req, &recv_status));
+
+ const size_t recv_seg_size = segment_sizes[recv_seg_id];
+ AccumulateTensorData<Device, T>(segment_update, segment_recv,
+ recv_seg_size);
+ }
+
+ // Now start pipelined ring allgather. At every step, for every rank, we
+ // iterate through segments with wraparound and send and recv from our
+  // neighbors. At the i'th iteration, rank r sends segment (r-i+1) and
+ // receives segment (r-i).
+ for (size_t i = 0; i < n - 1; ++i) {
+ const size_t send_seg_id = ((r - i + 1) + n) % n;
+ const size_t recv_seg_id = ((r - i) + n) % n;
+
+ // Segment to send - at every iteration we send segment (r-i+1)
+ T* segment_send = &(buffer[segment_starts[send_seg_id]]);
+
+ // Segment to recv - at every iteration we receive segment (r-i)
+ T* segment_recv = &(buffer[segment_starts[recv_seg_id]]);
+
+ MPI_REQUIRES_OK(MPI_Sendrecv(
+ segment_send, segment_sizes[send_seg_id], MPIType<T>(), send_to,
+ TAG_TENSOR, segment_recv, segment_sizes[recv_seg_id], MPIType<T>(),
+ recv_from, TAG_TENSOR, MPI_COMM_WORLD, &recv_status));
+ }
+
+ return Status::OK();
+}
+
+// Perform a ring allgather on a Tensor. Other ranks may allgather with a
+// tensor which differs in the first dimension only; all other dimensions must
+// be the same.
+//
+// For more information on the ring allgather, read the documentation for the
+// ring allreduce, which includes a ring allgather.
+template <typename Device, typename T>
+Status RingAllgather(OpKernelContext* context, const Tensor* input,
+ const std::vector<size_t>& sizes, Tensor* output) {
+ // Acquire MPI size and rank
+ int n, r;
+ MPI_REQUIRES_OK(MPI_Comm_size(MPI_COMM_WORLD, &n));
+ MPI_REQUIRES_OK(MPI_Comm_rank(MPI_COMM_WORLD, &r));
+
+ assert(sizes.size() == n);
+ assert(input->dim_size(0) == sizes[r]);
+
+  // Compute the number of elements in every "row". We can't compute the
+  // number of elements per chunk here, because chunks have variable length.
+ size_t elements_per_row = 1;
+ for (int i = 1; i < input->shape().dims(); i++) {
+ elements_per_row *= input->dim_size(i);
+ }
+
+ // Copy data from input tensor to correct place in output tensor.
+ std::vector<size_t> segment_starts(n);
+ segment_starts[0] = 0;
+ for (int i = 1; i < n; i++) {
+ segment_starts[i] = segment_starts[i - 1] + elements_per_row * sizes[i - 1];
+ }
+ size_t offset = segment_starts[r];
+
+ // Copy data to the right offset for this rank.
+ T* buffer = (T*)output->tensor_data().data();
+ CopyTensorData<Device>((void*)(buffer + offset),
+ (void*)input->tensor_data().data(),
+ elements_per_row * sizes[r] * sizeof(T));
+
+ // Receive from your left neighbor with wrap-around
+ const size_t recv_from = ((r - 1) + n) % n;
+
+ // Send to your right neighbor with wrap-around
+ const size_t send_to = (r + 1) % n;
+
+ // Perform a ring allgather. At every step, for every rank, we iterate
+ // through segments with wraparound and send and recv from our neighbors.
+  // At the i'th iteration, rank r sends segment (r-i) and receives segment
+ // (r-1-i).
+ MPI_Status recv_status;
+ for (size_t i = 0; i < n - 1; ++i) {
+ const size_t send_seg_id = ((r - i) + n) % n;
+ const size_t recv_seg_id = ((r - i - 1) + n) % n;
+
+ // Segment to send - at every iteration we send segment (r-i)
+ size_t offset_send = segment_starts[send_seg_id];
+ size_t rows_send = sizes[send_seg_id];
+ T* segment_send = &(buffer[offset_send]);
+
+ // Segment to recv - at every iteration we receive segment (r-1-i)
+ size_t offset_recv = segment_starts[recv_seg_id];
+ size_t rows_recv = sizes[recv_seg_id];
+ T* segment_recv = &(buffer[offset_recv]);
+
+ MPI_REQUIRES_OK(MPI_Sendrecv(
+ segment_send, elements_per_row * rows_send, MPIType<T>(), send_to,
+ TAG_TENSOR, segment_recv, elements_per_row * rows_recv, MPIType<T>(),
+ recv_from, TAG_TENSOR, MPI_COMM_WORLD, &recv_status));
+ }
+
+ return Status::OK();
+}
+
+} // namespace mpi
+} // namespace contrib
+} // namespace tensorflow
+
+#endif // TENSORFLOW_USE_MPI
+
+#undef TENSORFLOW_CONTRIB_MPI_H_
+#endif // TENSORFLOW_CONTRIB_MPI_H_
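
To make the schedule documented in the ring.h comment above concrete, here is a small, self-contained simulation. It is plain Python and not part of the patch; the function name ring_schedule and its arguments are illustrative only. It reproduces the segment-size/segment-start bookkeeping used by RingAllreduce and prints which segment each rank sends and receives at every scatter-reduce and allgather step, which also makes the 2K(N - 1)/N per-node traffic estimate easy to check.

    # Sketch of the ring schedule documented in ring.h (illustrative only).
    def ring_schedule(num_elements, n):
        # Split num_elements into n nearly equal segments; the first
        # (num_elements % n) segments get one extra element.
        segment_sizes = [num_elements // n] * n
        for i in range(num_elements % n):
            segment_sizes[i] += 1
        segment_starts = [0] * n
        for i in range(1, n):
            segment_starts[i] = segment_starts[i - 1] + segment_sizes[i - 1]

        # Scatter-reduce: at step i, rank r sends segment (r - i) and receives
        # segment (r - i - 1), accumulating the received data locally.
        for i in range(n - 1):
            for r in range(n):
                print("reduce step %d: rank %d sends seg %d, recvs seg %d"
                      % (i, r, (r - i) % n, (r - i - 1) % n))

        # Allgather: at step i, rank r sends segment (r - i + 1) and receives
        # segment (r - i); received segments overwrite rather than accumulate.
        for i in range(n - 1):
            for r in range(n):
                print("gather step %d: rank %d sends seg %d, recvs seg %d"
                      % (i, r, (r - i + 1) % n, (r - i) % n))

        # Each rank sends 2 * (n - 1) segments of roughly num_elements / n
        # elements, i.e. about 2 * num_elements * (n - 1) / n elements total.
        return segment_sizes, segment_starts

    if __name__ == "__main__":
        ring_schedule(10, 4)

Running it with n = 4 reproduces the transfer tables in the comment above (Segment 0: Node 0 -> Node 1 in the first scatter-reduce step, and so on).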
diff --git a/tensorflow/contrib/pi_examples/camera/Makefile b/tensorflow/contrib/pi_examples/camera/Makefile
index 182baefcd6..578f1336f3 100644
--- a/tensorflow/contrib/pi_examples/camera/Makefile
+++ b/tensorflow/contrib/pi_examples/camera/Makefile
@@ -43,13 +43,13 @@ INCLUDES := \
-I$(PROTOGENDIR) \
-I$(PBTGENDIR)
LIBS := \
--lstdc++ \
--lprotobuf \
--lv4l2 \
-Wl,--allow-multiple-definition \
-Wl,--whole-archive \
-ltensorflow-core \
-Wl,--no-whole-archive \
+-lstdc++ \
+-lprotobuf \
+-lv4l2 \
-ldl \
-lpthread \
-lm \
diff --git a/tensorflow/contrib/pi_examples/label_image/Makefile b/tensorflow/contrib/pi_examples/label_image/Makefile
index 0cf71bd294..19652e581d 100644
--- a/tensorflow/contrib/pi_examples/label_image/Makefile
+++ b/tensorflow/contrib/pi_examples/label_image/Makefile
@@ -43,12 +43,12 @@ INCLUDES := \
-I$(PROTOGENDIR) \
-I$(PBTGENDIR)
LIBS := \
--lstdc++ \
--lprotobuf \
-Wl,--allow-multiple-definition \
-Wl,--whole-archive \
-ltensorflow-core \
-Wl,--no-whole-archive \
+-lstdc++ \
+-lprotobuf \
-ldl \
-lpthread \
-lm \
diff --git a/tensorflow/contrib/rnn/python/ops/gru_ops.py b/tensorflow/contrib/rnn/python/ops/gru_ops.py
index bf74fd7544..75536e3f5f 100644
--- a/tensorflow/contrib/rnn/python/ops/gru_ops.py
+++ b/tensorflow/contrib/rnn/python/ops/gru_ops.py
@@ -27,6 +27,7 @@ from tensorflow.python.ops import nn_ops
from tensorflow.python.ops import rnn_cell_impl
from tensorflow.python.ops import variable_scope as vs
from tensorflow.python.platform import resource_loader
+from tensorflow.python.util.deprecation import deprecated_args
_gru_ops_so = loader.load_op_library(
resource_loader.get_path_to_datafile("_gru_ops.so"))
@@ -129,13 +130,24 @@ class GRUBlockCell(rnn_cell_impl.RNNCell):
"""
- def __init__(self, cell_size):
+ @deprecated_args(None, "cell_size is deprecated, use num_units instead",
+ "cell_size")
+ def __init__(self, num_units=None, cell_size=None):
"""Initialize the Block GRU cell.
Args:
- cell_size: int, GRU cell size.
+ num_units: int, The number of units in the GRU cell.
+ cell_size: int, The old (deprecated) name for `num_units`.
+
+ Raises:
+ ValueError: if both cell_size and num_units are not None;
+ or both are None.
"""
- self._cell_size = cell_size
+ if (cell_size is None) == (num_units is None):
+ raise ValueError("Exactly one of num_units or cell_size must be provided.")
+ if num_units is None:
+ num_units = cell_size
+ self._cell_size = num_units
@property
def state_size(self):
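
For context on the deprecation above, the short sketch below shows how the two constructor arguments relate. It assumes the cell is used through the usual tf.contrib.rnn export and is not part of the patch.

    import tensorflow as tf

    # Preferred: num_units, consistent with the other RNN cells.
    cell = tf.contrib.rnn.GRUBlockCell(num_units=64)

    # Still accepted, but logs a deprecation warning; cell_size is simply
    # forwarded to num_units by the check added in __init__ above.
    legacy_cell = tf.contrib.rnn.GRUBlockCell(cell_size=64)

    # Passing both arguments, or neither, raises ValueError.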
diff --git a/tensorflow/contrib/s3/BUILD b/tensorflow/contrib/s3/BUILD
new file mode 100644
index 0000000000..a4daed01e7
--- /dev/null
+++ b/tensorflow/contrib/s3/BUILD
@@ -0,0 +1,102 @@
+# Description:
+# S3 support for TensorFlow.
+
+package(default_visibility = ["//visibility:public"])
+
+licenses(["notice"]) # Apache 2.0
+
+exports_files(["LICENSE"])
+
+load(
+ "//tensorflow:tensorflow.bzl",
+ "tf_cc_test",
+)
+
+filegroup(
+ name = "all_files",
+ srcs = glob(
+ ["**/*"],
+ exclude = [
+ "**/METADATA",
+ "**/OWNERS",
+ ],
+ ),
+ visibility = ["//tensorflow:__subpackages__"],
+)
+
+cc_binary(
+ name = "s3_file_system.so",
+ srcs = [
+ "s3_crypto.cc",
+ "s3_crypto.h",
+ "s3_file_system.cc",
+ "s3_file_system.h",
+ ],
+ copts = ["-Wno-sign-compare"],
+ defines = select({
+ "//conditions:default": [
+ "ENABLE_CURL_CLIENT",
+ "ENABLE_NO_ENCRYPTION",
+ ],
+ }),
+ linkshared = 1,
+ deps = [
+ "//tensorflow/core:framework_headers_lib",
+ "@aws//:aws",
+ "@curl//:curl",
+ "@protobuf_archive//:protobuf_headers",
+ ],
+)
+
+cc_library(
+ name = "s3_crypto",
+ srcs = [
+ "s3_crypto.cc",
+ ],
+ hdrs = [
+ "s3_crypto.h",
+ ],
+ deps = [
+ "//tensorflow/core:lib",
+ "//tensorflow/core:lib_internal",
+ "@aws//:aws",
+ "@boringssl//:crypto",
+ ],
+ alwayslink = 1,
+)
+
+cc_library(
+ name = "s3_file_system",
+ srcs = [
+ "s3_file_system.cc",
+ ],
+ hdrs = [
+ "s3_file_system.h",
+ ],
+ deps = [
+ ":s3_crypto",
+ "//tensorflow/core:lib",
+ "//tensorflow/core:lib_internal",
+ "@aws//:aws",
+ ],
+ alwayslink = 1,
+)
+
+tf_cc_test(
+ name = "s3_file_system_test",
+ size = "small",
+ srcs = [
+ "s3_file_system_test.cc",
+ ],
+ tags = [
+ "manual",
+ ],
+ deps = [
+ ":s3_file_system",
+ "//tensorflow/core:lib",
+ "//tensorflow/core:lib_internal",
+ "//tensorflow/core:test",
+ "//tensorflow/core:test_main",
+ "@aws//:aws",
+ ],
+)
diff --git a/tensorflow/contrib/s3/s3_crypto.cc b/tensorflow/contrib/s3/s3_crypto.cc
new file mode 100644
index 0000000000..1450384dc0
--- /dev/null
+++ b/tensorflow/contrib/s3/s3_crypto.cc
@@ -0,0 +1,113 @@
+/* Copyright 2015 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#include "tensorflow/contrib/s3/s3_crypto.h"
+#include <openssl/hmac.h>
+#include <openssl/sha.h>
+
+#include <aws/core/utils/crypto/HashResult.h>
+#include <aws/s3/S3Client.h>
+
+namespace tensorflow {
+
+class S3Sha256HMACOpenSSLImpl : public Aws::Utils::Crypto::HMAC {
+ public:
+ S3Sha256HMACOpenSSLImpl() {}
+
+ virtual ~S3Sha256HMACOpenSSLImpl() = default;
+
+ virtual Aws::Utils::Crypto::HashResult Calculate(
+ const Aws::Utils::ByteBuffer& toSign,
+ const Aws::Utils::ByteBuffer& secret) override {
+ unsigned int length = SHA256_DIGEST_LENGTH;
+ Aws::Utils::ByteBuffer digest(length);
+ memset(digest.GetUnderlyingData(), 0, length);
+
+ HMAC_CTX ctx;
+ HMAC_CTX_init(&ctx);
+
+ HMAC_Init_ex(&ctx, secret.GetUnderlyingData(),
+ static_cast<int>(secret.GetLength()), EVP_sha256(), NULL);
+ HMAC_Update(&ctx, toSign.GetUnderlyingData(), toSign.GetLength());
+ HMAC_Final(&ctx, digest.GetUnderlyingData(), &length);
+ HMAC_CTX_cleanup(&ctx);
+
+ return Aws::Utils::Crypto::HashResult(std::move(digest));
+ }
+};
+
+class S3Sha256OpenSSLImpl : public Aws::Utils::Crypto::Hash {
+ public:
+ S3Sha256OpenSSLImpl() {}
+
+ virtual ~S3Sha256OpenSSLImpl() = default;
+
+ virtual Aws::Utils::Crypto::HashResult Calculate(
+ const Aws::String& str) override {
+ SHA256_CTX sha256;
+ SHA256_Init(&sha256);
+ SHA256_Update(&sha256, str.data(), str.size());
+
+ Aws::Utils::ByteBuffer hash(SHA256_DIGEST_LENGTH);
+ SHA256_Final(hash.GetUnderlyingData(), &sha256);
+
+ return Aws::Utils::Crypto::HashResult(std::move(hash));
+ }
+
+ virtual Aws::Utils::Crypto::HashResult Calculate(
+ Aws::IStream& stream) override {
+ SHA256_CTX sha256;
+ SHA256_Init(&sha256);
+
+ auto currentPos = stream.tellg();
+ if (currentPos == -1) {
+ currentPos = 0;
+ stream.clear();
+ }
+
+ stream.seekg(0, stream.beg);
+
+ char streamBuffer
+ [Aws::Utils::Crypto::Hash::INTERNAL_HASH_STREAM_BUFFER_SIZE];
+ while (stream.good()) {
+ stream.read(streamBuffer,
+ Aws::Utils::Crypto::Hash::INTERNAL_HASH_STREAM_BUFFER_SIZE);
+ auto bytesRead = stream.gcount();
+
+ if (bytesRead > 0) {
+ SHA256_Update(&sha256, streamBuffer, static_cast<size_t>(bytesRead));
+ }
+ }
+
+ stream.clear();
+ stream.seekg(currentPos, stream.beg);
+
+ Aws::Utils::ByteBuffer hash(SHA256_DIGEST_LENGTH);
+ SHA256_Final(hash.GetUnderlyingData(), &sha256);
+
+ return Aws::Utils::Crypto::HashResult(std::move(hash));
+ }
+};
+
+std::shared_ptr<Aws::Utils::Crypto::Hash>
+S3SHA256Factory::CreateImplementation() const {
+ return Aws::MakeShared<S3Sha256OpenSSLImpl>(S3CryptoAllocationTag);
+}
+
+std::shared_ptr<Aws::Utils::Crypto::HMAC>
+S3SHA256HmacFactory::CreateImplementation() const {
+ return Aws::MakeShared<S3Sha256HMACOpenSSLImpl>(S3CryptoAllocationTag);
+}
+
+} // namespace tensorflow
diff --git a/tensorflow/contrib/s3/s3_crypto.h b/tensorflow/contrib/s3/s3_crypto.h
new file mode 100644
index 0000000000..e376b8b0c0
--- /dev/null
+++ b/tensorflow/contrib/s3/s3_crypto.h
@@ -0,0 +1,35 @@
+/* Copyright 2015 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#include <aws/core/Aws.h>
+#include <aws/core/utils/crypto/Factories.h>
+#include <aws/core/utils/crypto/HMAC.h>
+#include <aws/core/utils/crypto/Hash.h>
+
+namespace tensorflow {
+static const char* S3CryptoAllocationTag = "S3CryptoAllocation";
+
+class S3SHA256Factory : public Aws::Utils::Crypto::HashFactory {
+ public:
+ std::shared_ptr<Aws::Utils::Crypto::Hash> CreateImplementation()
+ const override;
+};
+
+class S3SHA256HmacFactory : public Aws::Utils::Crypto::HMACFactory {
+ public:
+ std::shared_ptr<Aws::Utils::Crypto::HMAC> CreateImplementation()
+ const override;
+};
+
+} // namespace tensorflow
diff --git a/tensorflow/contrib/s3/s3_file_system.cc b/tensorflow/contrib/s3/s3_file_system.cc
new file mode 100644
index 0000000000..b09cf81d46
--- /dev/null
+++ b/tensorflow/contrib/s3/s3_file_system.cc
@@ -0,0 +1,575 @@
+/* Copyright 2015 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#include "tensorflow/contrib/s3/s3_file_system.h"
+#include "tensorflow/contrib/s3/s3_crypto.h"
+#include "tensorflow/core/lib/io/path.h"
+#include "tensorflow/core/platform/mutex.h"
+
+#include <aws/core/Aws.h>
+#include <aws/core/utils/FileSystemUtils.h>
+#include <aws/s3/S3Client.h>
+#include <aws/s3/S3Errors.h>
+#include <aws/s3/model/CopyObjectRequest.h>
+#include <aws/s3/model/DeleteObjectRequest.h>
+#include <aws/s3/model/GetObjectRequest.h>
+#include <aws/s3/model/HeadBucketRequest.h>
+#include <aws/s3/model/HeadObjectRequest.h>
+#include <aws/s3/model/ListObjectsRequest.h>
+#include <aws/s3/model/PutObjectRequest.h>
+
+#include <cstdlib>
+
+namespace tensorflow {
+
+static const char* kS3FileSystemAllocationTag = "S3FileSystemAllocation";
+static const size_t kS3ReadAppendableFileBufferSize = 1024 * 1024;
+static const int kS3GetChildrenMaxKeys = 100;
+
+Aws::Client::ClientConfiguration& GetDefaultClientConfig() {
+ static mutex cfg_lock;
+ static bool init(false);
+ static Aws::Client::ClientConfiguration cfg;
+
+ std::lock_guard<mutex> lock(cfg_lock);
+
+ if (!init) {
+ const char* endpoint = getenv("S3_ENDPOINT");
+ if (endpoint) {
+ cfg.endpointOverride = Aws::String(endpoint);
+ }
+ const char* region = getenv("S3_REGION");
+ if (region) {
+ cfg.region = Aws::String(region);
+ }
+ const char* use_https = getenv("S3_USE_HTTPS");
+ if (use_https) {
+ if (use_https[0] == '0') {
+ cfg.scheme = Aws::Http::Scheme::HTTP;
+ } else {
+ cfg.scheme = Aws::Http::Scheme::HTTPS;
+ }
+ }
+ const char* verify_ssl = getenv("S3_VERIFY_SSL");
+ if (verify_ssl) {
+ if (verify_ssl[0] == '0') {
+ cfg.verifySSL = false;
+ } else {
+ cfg.verifySSL = true;
+ }
+ }
+
+ init = true;
+ }
+
+ return cfg;
+};
+
+Status ParseS3Path(const string& fname, bool empty_object_ok, string* bucket,
+ string* object) {
+ if (!bucket || !object) {
+ return errors::Internal("bucket and object cannot be null.");
+ }
+ StringPiece scheme, bucketp, objectp;
+ io::ParseURI(fname, &scheme, &bucketp, &objectp);
+ if (scheme != "s3") {
+ return errors::InvalidArgument("S3 path doesn't start with 's3://': ",
+ fname);
+ }
+ *bucket = bucketp.ToString();
+ if (bucket->empty() || *bucket == ".") {
+ return errors::InvalidArgument("S3 path doesn't contain a bucket name: ",
+ fname);
+ }
+ objectp.Consume("/");
+ *object = objectp.ToString();
+ if (!empty_object_ok && object->empty()) {
+ return errors::InvalidArgument("S3 path doesn't contain an object name: ",
+ fname);
+ }
+ return Status::OK();
+}
+
+class S3RandomAccessFile : public RandomAccessFile {
+ public:
+ S3RandomAccessFile(const string& bucket, const string& object)
+ : bucket_(bucket), object_(object) {}
+
+ Status Read(uint64 offset, size_t n, StringPiece* result,
+ char* scratch) const override {
+ Aws::S3::S3Client s3Client(GetDefaultClientConfig());
+ Aws::S3::Model::GetObjectRequest getObjectRequest;
+ getObjectRequest.WithBucket(bucket_.c_str()).WithKey(object_.c_str());
+ string bytes = strings::StrCat("bytes=", offset, "-", offset + n - 1);
+ getObjectRequest.SetRange(bytes.c_str());
+ getObjectRequest.SetResponseStreamFactory([]() {
+ return Aws::New<Aws::StringStream>(kS3FileSystemAllocationTag);
+ });
+ auto getObjectOutcome = s3Client.GetObject(getObjectRequest);
+ if (!getObjectOutcome.IsSuccess()) {
+ n = 0;
+ *result = StringPiece(scratch, n);
+ return Status(error::OUT_OF_RANGE, "Read less bytes than requested");
+ }
+ n = getObjectOutcome.GetResult().GetContentLength();
+ std::stringstream ss;
+ ss << getObjectOutcome.GetResult().GetBody().rdbuf();
+ ss.read(scratch, n);
+
+ *result = StringPiece(scratch, n);
+ return Status::OK();
+ }
+
+ private:
+ string bucket_;
+ string object_;
+};
+
+class S3WritableFile : public WritableFile {
+ public:
+ S3WritableFile(const string& bucket, const string& object)
+ : bucket_(bucket),
+ object_(object),
+ sync_needed_(true),
+ outfile_(Aws::MakeShared<Aws::Utils::TempFile>(
+ kS3FileSystemAllocationTag, "/tmp/s3_filesystem_XXXXXX",
+ std::ios_base::binary | std::ios_base::trunc | std::ios_base::in |
+ std::ios_base::out)) {}
+
+ Status Append(const StringPiece& data) override {
+ if (!outfile_) {
+ return errors::FailedPrecondition(
+ "The internal temporary file is not writable.");
+ }
+ sync_needed_ = true;
+ outfile_->write(data.data(), data.size());
+ if (!outfile_->good()) {
+ return errors::Internal(
+ "Could not append to the internal temporary file.");
+ }
+ return Status::OK();
+ }
+
+ Status Close() override {
+ if (outfile_) {
+ TF_RETURN_IF_ERROR(Sync());
+ outfile_.reset();
+ }
+ return Status::OK();
+ }
+
+ Status Flush() override { return Sync(); }
+
+ Status Sync() override {
+ if (!outfile_) {
+ return errors::FailedPrecondition(
+ "The internal temporary file is not writable.");
+ }
+ if (!sync_needed_) {
+ return Status::OK();
+ }
+ Aws::Client::ClientConfiguration clientConfig = GetDefaultClientConfig();
+ clientConfig.connectTimeoutMs = 300000;
+ clientConfig.requestTimeoutMs = 600000;
+ Aws::S3::S3Client s3Client(clientConfig);
+ Aws::S3::Model::PutObjectRequest putObjectRequest;
+ putObjectRequest.WithBucket(bucket_.c_str()).WithKey(object_.c_str());
+ long offset = outfile_->tellp();
+ outfile_->seekg(0);
+ putObjectRequest.SetBody(outfile_);
+ putObjectRequest.SetContentLength(offset);
+ auto putObjectOutcome = s3Client.PutObject(putObjectRequest);
+ outfile_->clear();
+ outfile_->seekp(offset);
+ if (!putObjectOutcome.IsSuccess()) {
+ string error = strings::StrCat(
+ putObjectOutcome.GetError().GetExceptionName().c_str(), ": ",
+ putObjectOutcome.GetError().GetMessage().c_str());
+ return errors::Internal(error);
+ }
+ return Status::OK();
+ }
+
+ private:
+ string bucket_;
+ string object_;
+ bool sync_needed_;
+ std::shared_ptr<Aws::Utils::TempFile> outfile_;
+};
+
+class S3ReadOnlyMemoryRegion : public ReadOnlyMemoryRegion {
+ public:
+ S3ReadOnlyMemoryRegion(std::unique_ptr<char[]> data, uint64 length)
+ : data_(std::move(data)), length_(length) {}
+ const void* data() override { return reinterpret_cast<void*>(data_.get()); }
+ uint64 length() override { return length_; }
+
+ private:
+ std::unique_ptr<char[]> data_;
+ uint64 length_;
+};
+
+S3FileSystem::S3FileSystem() {
+ Aws::SDKOptions options;
+ options.loggingOptions.logLevel = Aws::Utils::Logging::LogLevel::Info;
+ options.cryptoOptions.sha256Factory_create_fn = []() {
+ return Aws::MakeShared<S3SHA256Factory>(S3CryptoAllocationTag);
+ };
+ options.cryptoOptions.sha256HMACFactory_create_fn = []() {
+ return Aws::MakeShared<S3SHA256HmacFactory>(S3CryptoAllocationTag);
+ };
+ Aws::InitAPI(options);
+}
+
+S3FileSystem::~S3FileSystem() {
+ Aws::SDKOptions options;
+ options.loggingOptions.logLevel = Aws::Utils::Logging::LogLevel::Info;
+ Aws::ShutdownAPI(options);
+}
+
+Status S3FileSystem::NewRandomAccessFile(
+ const string& fname, std::unique_ptr<RandomAccessFile>* result) {
+ string bucket, object;
+ TF_RETURN_IF_ERROR(ParseS3Path(fname, false, &bucket, &object));
+ result->reset(new S3RandomAccessFile(bucket, object));
+ return Status::OK();
+}
+
+Status S3FileSystem::NewWritableFile(const string& fname,
+ std::unique_ptr<WritableFile>* result) {
+ string bucket, object;
+ TF_RETURN_IF_ERROR(ParseS3Path(fname, false, &bucket, &object));
+ result->reset(new S3WritableFile(bucket, object));
+ return Status::OK();
+}
+
+Status S3FileSystem::NewAppendableFile(const string& fname,
+ std::unique_ptr<WritableFile>* result) {
+ std::unique_ptr<RandomAccessFile> reader;
+ TF_RETURN_IF_ERROR(NewRandomAccessFile(fname, &reader));
+ std::unique_ptr<char[]> buffer(new char[kS3ReadAppendableFileBufferSize]);
+ Status status;
+ uint64 offset = 0;
+ StringPiece read_chunk;
+
+ string bucket, object;
+ TF_RETURN_IF_ERROR(ParseS3Path(fname, false, &bucket, &object));
+ result->reset(new S3WritableFile(bucket, object));
+
+ while (true) {
+ status = reader->Read(offset, kS3ReadAppendableFileBufferSize, &read_chunk,
+ buffer.get());
+ if (status.ok()) {
+ (*result)->Append(read_chunk);
+ offset += kS3ReadAppendableFileBufferSize;
+ } else if (status.code() == error::OUT_OF_RANGE) {
+ (*result)->Append(read_chunk);
+ break;
+ } else {
+ (*result).reset();
+ return status;
+ }
+ }
+
+ return Status::OK();
+}
+
+Status S3FileSystem::NewReadOnlyMemoryRegionFromFile(
+ const string& fname, std::unique_ptr<ReadOnlyMemoryRegion>* result) {
+ uint64 size;
+ TF_RETURN_IF_ERROR(GetFileSize(fname, &size));
+ std::unique_ptr<char[]> data(new char[size]);
+
+ std::unique_ptr<RandomAccessFile> file;
+ TF_RETURN_IF_ERROR(NewRandomAccessFile(fname, &file));
+
+ StringPiece piece;
+ TF_RETURN_IF_ERROR(file->Read(0, size, &piece, data.get()));
+
+ result->reset(new S3ReadOnlyMemoryRegion(std::move(data), size));
+ return Status::OK();
+}
+
+Status S3FileSystem::FileExists(const string& fname) {
+ FileStatistics stats;
+ TF_RETURN_IF_ERROR(this->Stat(fname, &stats));
+ return Status::OK();
+}
+
+Status S3FileSystem::GetChildren(const string& dir,
+ std::vector<string>* result) {
+ string bucket, prefix;
+ TF_RETURN_IF_ERROR(ParseS3Path(dir, false, &bucket, &prefix));
+
+ if (prefix.back() != '/') {
+ prefix.push_back('/');
+ }
+
+ Aws::S3::S3Client s3Client(GetDefaultClientConfig());
+ Aws::S3::Model::ListObjectsRequest listObjectsRequest;
+ listObjectsRequest.WithBucket(bucket.c_str())
+ .WithPrefix(prefix.c_str())
+ .WithMaxKeys(kS3GetChildrenMaxKeys)
+ .WithDelimiter("/");
+ listObjectsRequest.SetResponseStreamFactory(
+ []() { return Aws::New<Aws::StringStream>(kS3FileSystemAllocationTag); });
+
+ Aws::S3::Model::ListObjectsResult listObjectsResult;
+ do {
+ auto listObjectsOutcome = s3Client.ListObjects(listObjectsRequest);
+ if (!listObjectsOutcome.IsSuccess()) {
+ string error = strings::StrCat(
+ listObjectsOutcome.GetError().GetExceptionName().c_str(), ": ",
+ listObjectsOutcome.GetError().GetMessage().c_str());
+ return errors::Internal(error);
+ }
+
+ listObjectsResult = listObjectsOutcome.GetResult();
+ for (const auto& object : listObjectsResult.GetCommonPrefixes()) {
+ Aws::String s = object.GetPrefix();
+ s.erase(s.length() - 1);
+ Aws::String entry = s.substr(strlen(prefix.c_str()));
+ if (entry.length() > 0) {
+ result->push_back(entry.c_str());
+ }
+ }
+ for (const auto& object : listObjectsResult.GetContents()) {
+ Aws::String s = object.GetKey();
+ Aws::String entry = s.substr(strlen(prefix.c_str()));
+ if (entry.length() > 0) {
+ result->push_back(entry.c_str());
+ }
+ }
+ listObjectsRequest.SetMarker(listObjectsResult.GetNextMarker());
+ } while (listObjectsResult.GetIsTruncated());
+
+ return Status::OK();
+}
+
+Status S3FileSystem::Stat(const string& fname, FileStatistics* stats) {
+ string bucket, object;
+ TF_RETURN_IF_ERROR(ParseS3Path(fname, true, &bucket, &object));
+
+ Aws::S3::S3Client s3Client(GetDefaultClientConfig());
+ if (object.empty()) {
+ Aws::S3::Model::HeadBucketRequest headBucketRequest;
+ headBucketRequest.WithBucket(bucket.c_str());
+ auto headBucketOutcome = s3Client.HeadBucket(headBucketRequest);
+ if (!headBucketOutcome.IsSuccess()) {
+ string error = strings::StrCat(
+ headBucketOutcome.GetError().GetExceptionName().c_str(), ": ",
+ headBucketOutcome.GetError().GetMessage().c_str());
+ return errors::Internal(error);
+ }
+ stats->length = 0;
+ stats->is_directory = 1;
+ return Status::OK();
+ }
+
+ bool found = false;
+
+ Aws::S3::Model::HeadObjectRequest headObjectRequest;
+ headObjectRequest.WithBucket(bucket.c_str()).WithKey(object.c_str());
+ headObjectRequest.SetResponseStreamFactory(
+ []() { return Aws::New<Aws::StringStream>(kS3FileSystemAllocationTag); });
+ auto headObjectOutcome = s3Client.HeadObject(headObjectRequest);
+ if (headObjectOutcome.IsSuccess()) {
+ stats->length = headObjectOutcome.GetResult().GetContentLength();
+ stats->is_directory = 0;
+ stats->mtime_nsec =
+ headObjectOutcome.GetResult().GetLastModified().Millis() * 1e6;
+ found = true;
+ }
+ string prefix = object;
+ if (prefix.back() != '/') {
+ prefix.push_back('/');
+ }
+ Aws::S3::Model::ListObjectsRequest listObjectsRequest;
+ listObjectsRequest.WithBucket(bucket.c_str())
+ .WithPrefix(prefix.c_str())
+ .WithMaxKeys(1);
+ listObjectsRequest.SetResponseStreamFactory(
+ []() { return Aws::New<Aws::StringStream>(kS3FileSystemAllocationTag); });
+ auto listObjectsOutcome = s3Client.ListObjects(listObjectsRequest);
+ if (listObjectsOutcome.IsSuccess()) {
+ if (listObjectsOutcome.GetResult().GetContents().size() > 0) {
+ stats->length = 0;
+ stats->is_directory = 1;
+ found = true;
+ }
+ }
+ if (!found) {
+ return errors::NotFound("Object ", fname, " does not exist");
+ }
+ return Status::OK();
+}
+
+Status S3FileSystem::DeleteFile(const string& fname) {
+ string bucket, object;
+ TF_RETURN_IF_ERROR(ParseS3Path(fname, false, &bucket, &object));
+
+ Aws::S3::S3Client s3Client(GetDefaultClientConfig());
+ Aws::S3::Model::DeleteObjectRequest deleteObjectRequest;
+ deleteObjectRequest.WithBucket(bucket.c_str()).WithKey(object.c_str());
+
+ auto deleteObjectOutcome = s3Client.DeleteObject(deleteObjectRequest);
+ if (!deleteObjectOutcome.IsSuccess()) {
+ string error = strings::StrCat(
+ deleteObjectOutcome.GetError().GetExceptionName().c_str(), ": ",
+ deleteObjectOutcome.GetError().GetMessage().c_str());
+ return errors::Internal(error);
+ }
+ return Status::OK();
+}
+
+Status S3FileSystem::CreateDir(const string& dirname) {
+ string bucket, object;
+ TF_RETURN_IF_ERROR(ParseS3Path(dirname, true, &bucket, &object));
+
+ if (object.empty()) {
+ Aws::S3::S3Client s3Client(GetDefaultClientConfig());
+ Aws::S3::Model::HeadBucketRequest headBucketRequest;
+ headBucketRequest.WithBucket(bucket.c_str());
+ auto headBucketOutcome = s3Client.HeadBucket(headBucketRequest);
+ if (!headBucketOutcome.IsSuccess()) {
+ return errors::NotFound("The bucket ", bucket, " was not found.");
+ }
+ return Status::OK();
+ }
+ string filename = dirname;
+ if (filename.back() != '/') {
+ filename.push_back('/');
+ }
+ std::unique_ptr<WritableFile> file;
+ TF_RETURN_IF_ERROR(NewWritableFile(filename, &file));
+ TF_RETURN_IF_ERROR(file->Close());
+ return Status::OK();
+}
+
+Status S3FileSystem::DeleteDir(const string& dirname) {
+ string bucket, object;
+ TF_RETURN_IF_ERROR(ParseS3Path(dirname, false, &bucket, &object));
+
+ Aws::S3::S3Client s3Client(GetDefaultClientConfig());
+ string prefix = object;
+ if (prefix.back() != '/') {
+ prefix.push_back('/');
+ }
+ Aws::S3::Model::ListObjectsRequest listObjectsRequest;
+ listObjectsRequest.WithBucket(bucket.c_str())
+ .WithPrefix(prefix.c_str())
+ .WithMaxKeys(2);
+ listObjectsRequest.SetResponseStreamFactory(
+ []() { return Aws::New<Aws::StringStream>(kS3FileSystemAllocationTag); });
+ auto listObjectsOutcome = s3Client.ListObjects(listObjectsRequest);
+ if (listObjectsOutcome.IsSuccess()) {
+ auto contents = listObjectsOutcome.GetResult().GetContents();
+ if (contents.size() > 1 ||
+ (contents.size() == 1 && contents[0].GetKey() != prefix.c_str())) {
+ return errors::FailedPrecondition("Cannot delete a non-empty directory.");
+ }
+ if (contents.size() == 1 && contents[0].GetKey() == prefix.c_str()) {
+ string filename = dirname;
+ if (filename.back() != '/') {
+ filename.push_back('/');
+ }
+ return DeleteFile(filename);
+ }
+ }
+ return Status::OK();
+}
+
+Status S3FileSystem::GetFileSize(const string& fname, uint64* file_size) {
+ FileStatistics stats;
+ TF_RETURN_IF_ERROR(this->Stat(fname, &stats));
+ *file_size = stats.length;
+ return Status::OK();
+}
+
+Status S3FileSystem::RenameFile(const string& src, const string& target) {
+ string src_bucket, src_object, target_bucket, target_object;
+ TF_RETURN_IF_ERROR(ParseS3Path(src, false, &src_bucket, &src_object));
+ TF_RETURN_IF_ERROR(
+ ParseS3Path(target, false, &target_bucket, &target_object));
+ if (src_object.back() == '/') {
+ if (target_object.back() != '/') {
+ target_object.push_back('/');
+ }
+ } else {
+ if (target_object.back() == '/') {
+ target_object.pop_back();
+ }
+ }
+
+ Aws::S3::S3Client s3Client(GetDefaultClientConfig());
+
+ Aws::S3::Model::CopyObjectRequest copyObjectRequest;
+ Aws::S3::Model::DeleteObjectRequest deleteObjectRequest;
+
+ Aws::S3::Model::ListObjectsRequest listObjectsRequest;
+ listObjectsRequest.WithBucket(src_bucket.c_str())
+ .WithPrefix(src_object.c_str())
+ .WithMaxKeys(kS3GetChildrenMaxKeys);
+ listObjectsRequest.SetResponseStreamFactory(
+ []() { return Aws::New<Aws::StringStream>(kS3FileSystemAllocationTag); });
+
+ Aws::S3::Model::ListObjectsResult listObjectsResult;
+ do {
+ auto listObjectsOutcome = s3Client.ListObjects(listObjectsRequest);
+ if (!listObjectsOutcome.IsSuccess()) {
+ string error = strings::StrCat(
+ listObjectsOutcome.GetError().GetExceptionName().c_str(), ": ",
+ listObjectsOutcome.GetError().GetMessage().c_str());
+ return errors::Internal(error);
+ }
+
+ listObjectsResult = listObjectsOutcome.GetResult();
+ for (const auto& object : listObjectsResult.GetContents()) {
+ Aws::String src_key = object.GetKey();
+ Aws::String target_key = src_key;
+ target_key.replace(0, src_object.length(), target_object.c_str());
+ Aws::String source = Aws::String(src_bucket.c_str()) + "/" + src_key;
+
+ copyObjectRequest.SetBucket(target_bucket.c_str());
+ copyObjectRequest.SetKey(target_key);
+ copyObjectRequest.SetCopySource(source);
+
+ auto copyObjectOutcome = s3Client.CopyObject(copyObjectRequest);
+ if (!copyObjectOutcome.IsSuccess()) {
+ string error = strings::StrCat(
+ copyObjectOutcome.GetError().GetExceptionName().c_str(), ": ",
+ copyObjectOutcome.GetError().GetMessage().c_str());
+ return errors::Internal(error);
+ }
+
+ deleteObjectRequest.SetBucket(src_bucket.c_str());
+ deleteObjectRequest.SetKey(src_key.c_str());
+
+ auto deleteObjectOutcome = s3Client.DeleteObject(deleteObjectRequest);
+ if (!deleteObjectOutcome.IsSuccess()) {
+ string error = strings::StrCat(
+ deleteObjectOutcome.GetError().GetExceptionName().c_str(), ": ",
+ deleteObjectOutcome.GetError().GetMessage().c_str());
+ return errors::Internal(error);
+ }
+ }
+ listObjectsRequest.SetMarker(listObjectsResult.GetNextMarker());
+ } while (listObjectsResult.GetIsTruncated());
+
+ return Status::OK();
+}
+
+REGISTER_FILE_SYSTEM("s3", S3FileSystem);
+
+} // namespace tensorflow
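
As a usage-level illustration of the new filesystem and the environment variables read in GetDefaultClientConfig(), here is a hedged Python sketch. It is not part of the patch; the plugin path, endpoint, and bucket name are placeholders, and whether an explicit load_file_system_library call is needed depends on how the target is linked into your build. Credentials come from the AWS SDK's default provider chain (e.g. AWS_ACCESS_KEY_ID/AWS_SECRET_ACCESS_KEY or ~/.aws/credentials).

    import os
    import tensorflow as tf

    # Optional overrides read by GetDefaultClientConfig().
    os.environ["S3_ENDPOINT"] = "s3.us-west-2.amazonaws.com"  # placeholder
    os.environ["S3_USE_HTTPS"] = "1"
    os.environ["S3_VERIFY_SSL"] = "1"

    # If S3 support is built as the standalone s3_file_system.so plugin, it
    # can be loaded explicitly; the path below is hypothetical.
    tf.load_file_system_library("/path/to/s3_file_system.so")

    # Once the "s3" scheme is registered, ordinary file APIs work on S3 paths.
    with tf.gfile.GFile("s3://my-bucket/hello.txt", "w") as f:  # hypothetical bucket
        f.write("hello from S3FileSystem")
    print(tf.gfile.ListDirectory("s3://my-bucket"))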
diff --git a/tensorflow/contrib/s3/s3_file_system.h b/tensorflow/contrib/s3/s3_file_system.h
new file mode 100644
index 0000000000..31ba3cecc5
--- /dev/null
+++ b/tensorflow/contrib/s3/s3_file_system.h
@@ -0,0 +1,60 @@
+/* Copyright 2015 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifndef TENSORFLOW_CONTRIB_S3_S3_FILE_SYSTEM_H_
+#define TENSORFLOW_CONTRIB_S3_S3_FILE_SYSTEM_H_
+
+#include "tensorflow/core/platform/env.h"
+
+namespace tensorflow {
+
+class S3FileSystem : public FileSystem {
+ public:
+ S3FileSystem();
+ ~S3FileSystem();
+
+ Status NewRandomAccessFile(
+ const string& fname, std::unique_ptr<RandomAccessFile>* result) override;
+
+ Status NewWritableFile(const string& fname,
+ std::unique_ptr<WritableFile>* result) override;
+
+ Status NewAppendableFile(const string& fname,
+ std::unique_ptr<WritableFile>* result) override;
+
+ Status NewReadOnlyMemoryRegionFromFile(
+ const string& fname,
+ std::unique_ptr<ReadOnlyMemoryRegion>* result) override;
+
+ Status FileExists(const string& fname) override;
+
+ Status GetChildren(const string& dir, std::vector<string>* result) override;
+
+ Status Stat(const string& fname, FileStatistics* stat) override;
+
+ Status DeleteFile(const string& fname) override;
+
+ Status CreateDir(const string& name) override;
+
+ Status DeleteDir(const string& name) override;
+
+ Status GetFileSize(const string& fname, uint64* size) override;
+
+ Status RenameFile(const string& src, const string& target) override;
+};
+
+} // namespace tensorflow
+
+#endif // TENSORFLOW_CONTRIB_S3_S3_FILE_SYSTEM_H_
diff --git a/tensorflow/contrib/s3/s3_file_system_test.cc b/tensorflow/contrib/s3/s3_file_system_test.cc
new file mode 100644
index 0000000000..949281fad4
--- /dev/null
+++ b/tensorflow/contrib/s3/s3_file_system_test.cc
@@ -0,0 +1,233 @@
+/* Copyright 2015 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/contrib/s3/s3_file_system.h"
+
+#include "tensorflow/core/lib/core/status_test_util.h"
+#include "tensorflow/core/lib/gtl/stl_util.h"
+#include "tensorflow/core/lib/io/path.h"
+#include "tensorflow/core/platform/file_system.h"
+#include "tensorflow/core/platform/test.h"
+
+namespace tensorflow {
+
+namespace {
+
+class S3FileSystemTest : public ::testing::Test {
+ protected:
+ S3FileSystemTest() {}
+
+ string TmpDir(const string& path) {
+ char* test_dir = getenv("S3_TEST_TMPDIR");
+ if (test_dir != nullptr) {
+ return io::JoinPath(string(test_dir), path);
+ } else {
+ return "s3://" + io::JoinPath(testing::TmpDir(), path);
+ }
+ }
+
+ Status WriteString(const string& fname, const string& content) {
+ std::unique_ptr<WritableFile> writer;
+ TF_RETURN_IF_ERROR(s3fs.NewWritableFile(fname, &writer));
+ TF_RETURN_IF_ERROR(writer->Append(content));
+ TF_RETURN_IF_ERROR(writer->Close());
+ return Status::OK();
+ }
+
+ Status ReadAll(const string& fname, string* content) {
+ std::unique_ptr<RandomAccessFile> reader;
+ TF_RETURN_IF_ERROR(s3fs.NewRandomAccessFile(fname, &reader));
+
+ uint64 file_size = 0;
+ TF_RETURN_IF_ERROR(s3fs.GetFileSize(fname, &file_size));
+
+ content->resize(file_size);
+ StringPiece result;
+ TF_RETURN_IF_ERROR(
+ reader->Read(0, file_size, &result, gtl::string_as_array(content)));
+ if (file_size != result.size()) {
+ return errors::DataLoss("expected ", file_size, " got ", result.size(),
+ " bytes");
+ }
+ return Status::OK();
+ }
+
+ S3FileSystem s3fs;
+};
+
+TEST_F(S3FileSystemTest, NewRandomAccessFile) {
+ const string fname = TmpDir("RandomAccessFile");
+ const string content = "abcdefghijklmn";
+
+ TF_ASSERT_OK(WriteString(fname, content));
+
+ std::unique_ptr<RandomAccessFile> reader;
+ TF_EXPECT_OK(s3fs.NewRandomAccessFile(fname, &reader));
+
+ string got;
+ got.resize(content.size());
+ StringPiece result;
+ TF_EXPECT_OK(
+ reader->Read(0, content.size(), &result, gtl::string_as_array(&got)));
+ EXPECT_EQ(content.size(), result.size());
+ EXPECT_EQ(content, result);
+
+ got.clear();
+ got.resize(4);
+ TF_EXPECT_OK(reader->Read(2, 4, &result, gtl::string_as_array(&got)));
+ EXPECT_EQ(4, result.size());
+ EXPECT_EQ(content.substr(2, 4), result);
+}
+
+TEST_F(S3FileSystemTest, NewWritableFile) {
+ std::unique_ptr<WritableFile> writer;
+ const string fname = TmpDir("WritableFile");
+ TF_EXPECT_OK(s3fs.NewWritableFile(fname, &writer));
+ TF_EXPECT_OK(writer->Append("content1,"));
+ TF_EXPECT_OK(writer->Append("content2"));
+ TF_EXPECT_OK(writer->Flush());
+ TF_EXPECT_OK(writer->Sync());
+ TF_EXPECT_OK(writer->Close());
+
+ string content;
+ TF_EXPECT_OK(ReadAll(fname, &content));
+ EXPECT_EQ("content1,content2", content);
+}
+
+TEST_F(S3FileSystemTest, NewAppendableFile) {
+ std::unique_ptr<WritableFile> writer;
+
+ const string fname = TmpDir("AppendableFile");
+ TF_ASSERT_OK(WriteString(fname, "test"));
+
+ TF_EXPECT_OK(s3fs.NewAppendableFile(fname, &writer));
+ TF_EXPECT_OK(writer->Append("content"));
+ TF_EXPECT_OK(writer->Close());
+}
+
+TEST_F(S3FileSystemTest, NewReadOnlyMemoryRegionFromFile) {
+ const string fname = TmpDir("MemoryFile");
+ const string content = "content";
+ TF_ASSERT_OK(WriteString(fname, content));
+ std::unique_ptr<ReadOnlyMemoryRegion> region;
+ TF_EXPECT_OK(s3fs.NewReadOnlyMemoryRegionFromFile(fname, &region));
+
+ EXPECT_EQ(content, StringPiece(reinterpret_cast<const char*>(region->data()),
+ region->length()));
+}
+
+TEST_F(S3FileSystemTest, FileExists) {
+ const string fname = TmpDir("FileExists");
+ EXPECT_EQ(error::Code::NOT_FOUND, s3fs.FileExists(fname).code());
+ TF_ASSERT_OK(WriteString(fname, "test"));
+ TF_EXPECT_OK(s3fs.FileExists(fname));
+}
+
+TEST_F(S3FileSystemTest, GetChildren) {
+ const string base = TmpDir("GetChildren");
+ TF_EXPECT_OK(s3fs.CreateDir(base));
+
+ const string file = io::JoinPath(base, "TestFile.csv");
+ TF_EXPECT_OK(WriteString(file, "test"));
+
+ const string subdir = io::JoinPath(base, "SubDir");
+ TF_EXPECT_OK(s3fs.CreateDir(subdir));
+  // S3 object storage doesn't support empty directories, so we create a file
+  // in the directory.
+ const string subfile = io::JoinPath(subdir, "TestSubFile.csv");
+ TF_EXPECT_OK(WriteString(subfile, "test"));
+
+ std::vector<string> children;
+ TF_EXPECT_OK(s3fs.GetChildren(base, &children));
+ std::sort(children.begin(), children.end());
+ EXPECT_EQ(std::vector<string>({"SubDir", "TestFile.csv"}), children);
+}
+
+TEST_F(S3FileSystemTest, DeleteFile) {
+ const string fname = TmpDir("DeleteFile");
+ TF_ASSERT_OK(WriteString(fname, "test"));
+ TF_EXPECT_OK(s3fs.DeleteFile(fname));
+}
+
+TEST_F(S3FileSystemTest, GetFileSize) {
+ const string fname = TmpDir("GetFileSize");
+ TF_ASSERT_OK(WriteString(fname, "test"));
+ uint64 file_size = 0;
+ TF_EXPECT_OK(s3fs.GetFileSize(fname, &file_size));
+ EXPECT_EQ(4, file_size);
+}
+
+TEST_F(S3FileSystemTest, CreateDir) {
+  // S3 object storage doesn't support empty directories, so we create a file
+  // in the directory.
+ const string dir = TmpDir("CreateDir");
+ TF_EXPECT_OK(s3fs.CreateDir(dir));
+
+ const string file = io::JoinPath(dir, "CreateDirFile.csv");
+ TF_EXPECT_OK(WriteString(file, "test"));
+ FileStatistics stat;
+ TF_EXPECT_OK(s3fs.Stat(dir, &stat));
+ EXPECT_TRUE(stat.is_directory);
+}
+
+TEST_F(S3FileSystemTest, DeleteDir) {
+  // S3 object storage doesn't support empty directories, so we create a file
+  // in the directory.
+ const string dir = TmpDir("DeleteDir");
+ const string file = io::JoinPath(dir, "DeleteDirFile.csv");
+ TF_EXPECT_OK(WriteString(file, "test"));
+ EXPECT_FALSE(s3fs.DeleteDir(dir).ok());
+
+ TF_EXPECT_OK(s3fs.DeleteFile(file));
+ TF_EXPECT_OK(s3fs.DeleteDir(dir));
+ FileStatistics stat;
+ EXPECT_FALSE(s3fs.Stat(dir, &stat).ok());
+}
+
+TEST_F(S3FileSystemTest, RenameFile) {
+ const string fname1 = TmpDir("RenameFile1");
+ const string fname2 = TmpDir("RenameFile2");
+ TF_ASSERT_OK(WriteString(fname1, "test"));
+ TF_EXPECT_OK(s3fs.RenameFile(fname1, fname2));
+ string content;
+ TF_EXPECT_OK(ReadAll(fname2, &content));
+ EXPECT_EQ("test", content);
+}
+
+TEST_F(S3FileSystemTest, RenameFile_Overwrite) {
+ const string fname1 = TmpDir("RenameFile1");
+ const string fname2 = TmpDir("RenameFile2");
+
+ TF_ASSERT_OK(WriteString(fname2, "test"));
+ TF_EXPECT_OK(s3fs.FileExists(fname2));
+
+ TF_ASSERT_OK(WriteString(fname1, "test"));
+ TF_EXPECT_OK(s3fs.RenameFile(fname1, fname2));
+ string content;
+ TF_EXPECT_OK(ReadAll(fname2, &content));
+ EXPECT_EQ("test", content);
+}
+
+TEST_F(S3FileSystemTest, StatFile) {
+ const string fname = TmpDir("StatFile");
+ TF_ASSERT_OK(WriteString(fname, "test"));
+ FileStatistics stat;
+ TF_EXPECT_OK(s3fs.Stat(fname, &stat));
+ EXPECT_EQ(4, stat.length);
+ EXPECT_FALSE(stat.is_directory);
+}
+
+} // namespace
+} // namespace tensorflow
diff --git a/tensorflow/contrib/tpu/ops/infeed_ops.cc b/tensorflow/contrib/tpu/ops/infeed_ops.cc
index c12e83137a..849c4a1102 100644
--- a/tensorflow/contrib/tpu/ops/infeed_ops.cc
+++ b/tensorflow/contrib/tpu/ops/infeed_ops.cc
@@ -13,6 +13,7 @@ See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
+#include "tensorflow/core/framework/common_shape_fns.h"
#include "tensorflow/core/framework/op.h"
#include "tensorflow/core/framework/shape_inference.h"
@@ -26,14 +27,7 @@ REGISTER_OP("InfeedDequeue")
.Attr("dtype: type")
.Attr("shape: shape")
.SetIsStateful()
- .SetShapeFn([](InferenceContext* c) {
- PartialTensorShape shape;
- TF_RETURN_IF_ERROR(c->GetAttr("shape", &shape));
- ShapeHandle out;
- TF_RETURN_IF_ERROR(c->MakeShapeFromPartialTensorShape(shape, &out));
- c->set_output(0, out);
- return Status::OK();
- })
+ .SetShapeFn(shape_inference::ExplicitShape)
.Doc(R"doc(
A placeholder op for a value that will be fed into the computation.
diff --git a/tensorflow/contrib/tpu/ops/outfeed_ops.cc b/tensorflow/contrib/tpu/ops/outfeed_ops.cc
index 16c57a1c2b..ed5756cc54 100644
--- a/tensorflow/contrib/tpu/ops/outfeed_ops.cc
+++ b/tensorflow/contrib/tpu/ops/outfeed_ops.cc
@@ -13,6 +13,7 @@ See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
+#include "tensorflow/core/framework/common_shape_fns.h"
#include "tensorflow/core/framework/op.h"
#include "tensorflow/core/framework/shape_inference.h"
@@ -48,14 +49,7 @@ REGISTER_OP("OutfeedDequeue")
.Attr("shape: shape")
.Attr("device_ordinal: int = -1")
.SetIsStateful()
- .SetShapeFn([](InferenceContext* c) {
- PartialTensorShape shape;
- TF_RETURN_IF_ERROR(c->GetAttr("shape", &shape));
- ShapeHandle out;
- TF_RETURN_IF_ERROR(c->MakeShapeFromPartialTensorShape(shape, &out));
- c->set_output(0, out);
- return Status::OK();
- })
+ .SetShapeFn(shape_inference::ExplicitShape)
.Doc(R"doc(
Retrieves a single tensor from the computation outfeed. This operation will
block indefinitely until data is available.
diff --git a/tensorflow/core/framework/common_shape_fns.cc b/tensorflow/core/framework/common_shape_fns.cc
index 9e0f6d3be1..d75280dd5c 100644
--- a/tensorflow/core/framework/common_shape_fns.cc
+++ b/tensorflow/core/framework/common_shape_fns.cc
@@ -1200,6 +1200,15 @@ Status ScatterNdUpdateShape(InferenceContext* c) {
return Status::OK();
}
+Status ExplicitShape(InferenceContext* c) {
+ PartialTensorShape shape;
+ TF_RETURN_IF_ERROR(c->GetAttr("shape", &shape));
+ ShapeHandle output_shape;
+ TF_RETURN_IF_ERROR(c->MakeShapeFromPartialTensorShape(shape, &output_shape));
+ c->set_output(0, output_shape);
+ return Status::OK();
+}
+
} // namespace shape_inference
} // namespace tensorflow
diff --git a/tensorflow/core/framework/common_shape_fns.h b/tensorflow/core/framework/common_shape_fns.h
index a3c79afc0b..aef3405bc5 100644
--- a/tensorflow/core/framework/common_shape_fns.h
+++ b/tensorflow/core/framework/common_shape_fns.h
@@ -213,6 +213,9 @@ Status ValidateSparseTensor(InferenceContext* c, ShapeHandle indices_shape,
// Shape function for ScatterNd update/add/sub/... operations.
Status ScatterNdUpdateShape(InferenceContext* c);
+// Shape function for ops with an explicit "shape" attribute.
+Status ExplicitShape(InferenceContext* c);
+
} // namespace shape_inference
} // namespace tensorflow
diff --git a/tensorflow/core/framework/tensor.cc b/tensorflow/core/framework/tensor.cc
index df84d9d528..a5b5ef0acc 100644
--- a/tensorflow/core/framework/tensor.cc
+++ b/tensorflow/core/framework/tensor.cc
@@ -451,7 +451,7 @@ Buffer<T>::~Buffer() {
// default value for T.
//
// This routine is using the typed fields (float_val, etc.) in the
-// tenor proto as opposed to the untyped binary representation
+// tensor proto as opposed to the untyped binary representation
// (tensor_content). This is used when we expect the TensorProto is
// used by a client program which may not know how to encode a tensor
// in the compact binary representation.
diff --git a/tensorflow/core/grappler/clusters/single_machine.cc b/tensorflow/core/grappler/clusters/single_machine.cc
index 6577562d4e..1a6fad4182 100644
--- a/tensorflow/core/grappler/clusters/single_machine.cc
+++ b/tensorflow/core/grappler/clusters/single_machine.cc
@@ -281,8 +281,8 @@ Status SingleMachine::ResetSession() {
// Make sure the session is properly closed
TF_RETURN_IF_ERROR(Shutdown());
- // Destroying the object deletes all its varibles as well. This is only true
- // for DirectSession.
+ // Destroying the object deletes all its variables as well. This is only
+ // true for DirectSession.
session_.reset();
}
diff --git a/tensorflow/core/grappler/costs/op_level_cost_estimator.cc b/tensorflow/core/grappler/costs/op_level_cost_estimator.cc
index 386c0c7914..fbafed7c1f 100644
--- a/tensorflow/core/grappler/costs/op_level_cost_estimator.cc
+++ b/tensorflow/core/grappler/costs/op_level_cost_estimator.cc
@@ -26,8 +26,8 @@ namespace grappler {
constexpr int kOpsPerMac = 2;
constexpr char kConv2d[] = "Conv2D";
-constexpr char kConv2dBackPropFilter[] = "Conv2DBackpropFilter";
-constexpr char kConv2dBackPropInput[] = "Conv2DBackpropInput";
+constexpr char kConv2dBackpropFilter[] = "Conv2DBackpropFilter";
+constexpr char kConv2dBackpropInput[] = "Conv2DBackpropInput";
constexpr char kMatMul[] = "MatMul";
constexpr char kSparseMatMul[] = "SparseMatMul";
constexpr char kIdentity[] = "Identity";
@@ -150,10 +150,10 @@ OpLevelCostEstimator::OpLevelCostEstimator() {
device_cost_impl_ = {
{kConv2d, wrap(&OpLevelCostEstimator::PredictConv2D)},
- {kConv2dBackPropFilter,
- wrap(&OpLevelCostEstimator::PredictConv2DBackPropFilter)},
- {kConv2dBackPropInput,
- wrap(&OpLevelCostEstimator::PredictConv2DBackPropInput)},
+ {kConv2dBackpropFilter,
+ wrap(&OpLevelCostEstimator::PredictConv2DBackpropFilter)},
+ {kConv2dBackpropInput,
+ wrap(&OpLevelCostEstimator::PredictConv2DBackpropInput)},
{kMatMul, wrap(&OpLevelCostEstimator::PredictMatMul)},
{kSparseMatMul, wrap(&OpLevelCostEstimator::PredictMatMul)},
{kIdentity, wrap(&OpLevelCostEstimator::PredictNoOp)},
@@ -668,20 +668,20 @@ int64 OpLevelCostEstimator::CountBatchMatMulOperations(
return ops;
}
-// TODO(cliffy): Dedup this method and CountConv2DBackPropFilterOperations.
-int64 OpLevelCostEstimator::CountConv2DBackPropInputOperations(
+// TODO(cliffy): Dedup this method and CountConv2DBackpropFilterOperations.
+int64 OpLevelCostEstimator::CountConv2DBackpropInputOperations(
const OpInfo& op_features, ConvolutionDimensions* returned_conv_dims,
bool* found_unknown_shapes) const {
int64 ops = 0;
- if (op_features.op() != kConv2dBackPropInput) {
+ if (op_features.op() != kConv2dBackpropInput) {
LOG(ERROR) << "Invalid Operation";
return ops;
}
if (op_features.outputs_size() != 1) {
// Need _output_shapes for input shape.
- LOG(ERROR) << "No output shape in Conv2DBackPropInput op.";
+ LOG(ERROR) << "No output shape in Conv2DBackpropInput op.";
return ops;
}
@@ -696,7 +696,7 @@ int64 OpLevelCostEstimator::CountConv2DBackPropInputOperations(
ops *= conv_dims.iz * conv_dims.oz;
ops *= kOpsPerMac;
- VLOG(1) << "Operations for Conv2DBackPropInput " << ops;
+ VLOG(1) << "Operations for Conv2DBackpropInput " << ops;
if (returned_conv_dims != nullptr) {
*returned_conv_dims = conv_dims;
@@ -704,18 +704,18 @@ int64 OpLevelCostEstimator::CountConv2DBackPropInputOperations(
return ops;
}
-int64 OpLevelCostEstimator::CountConv2DBackPropFilterOperations(
+int64 OpLevelCostEstimator::CountConv2DBackpropFilterOperations(
const OpInfo& op_features, ConvolutionDimensions* returned_conv_dims,
bool* found_unknown_shapes) const {
int64 ops = 0;
- if (op_features.op() != kConv2dBackPropFilter) {
+ if (op_features.op() != kConv2dBackpropFilter) {
LOG(ERROR) << "Invalid Operation";
return ops;
}
if (op_features.outputs_size() != 1) {
// Need _output_shapes for input shape.
- LOG(ERROR) << "No output shape in Conv2DBackPropFilter op.";
+ LOG(ERROR) << "No output shape in Conv2DBackpropFilter op.";
return ops;
}
@@ -730,7 +730,7 @@ int64 OpLevelCostEstimator::CountConv2DBackPropFilterOperations(
ops *= conv_dims.iz * conv_dims.oz;
ops *= kOpsPerMac;
- VLOG(1) << "Operations for Conv2DBackPropFilter" << ops;
+ VLOG(1) << "Operations for Conv2DBackpropFilter" << ops;
if (returned_conv_dims != nullptr) {
*returned_conv_dims = conv_dims;
@@ -814,22 +814,22 @@ Costs OpLevelCostEstimator::PredictConv2D(const OpInfo& op_features) const {
return costs;
}
-Costs OpLevelCostEstimator::PredictConv2DBackPropInput(
+Costs OpLevelCostEstimator::PredictConv2DBackpropInput(
const OpInfo& op_features) const {
bool found_unknown_shapes = false;
auto costs =
- PredictOpCountBasedCost(CountConv2DBackPropInputOperations(
+ PredictOpCountBasedCost(CountConv2DBackpropInputOperations(
op_features, nullptr, &found_unknown_shapes),
op_features);
costs.inaccurate = found_unknown_shapes;
return costs;
}
-Costs OpLevelCostEstimator::PredictConv2DBackPropFilter(
+Costs OpLevelCostEstimator::PredictConv2DBackpropFilter(
const OpInfo& op_features) const {
bool found_unknown_shapes = false;
auto costs =
- PredictOpCountBasedCost(CountConv2DBackPropFilterOperations(
+ PredictOpCountBasedCost(CountConv2DBackpropFilterOperations(
op_features, nullptr, &found_unknown_shapes),
op_features);
costs.inaccurate = found_unknown_shapes;
diff --git a/tensorflow/core/grappler/costs/op_level_cost_estimator.h b/tensorflow/core/grappler/costs/op_level_cost_estimator.h
index 36ef6a5c61..b4302dc9e1 100644
--- a/tensorflow/core/grappler/costs/op_level_cost_estimator.h
+++ b/tensorflow/core/grappler/costs/op_level_cost_estimator.h
@@ -82,10 +82,10 @@ class OpLevelCostEstimator {
bool* found_unknown_shapes) const;
int64 CountBatchMatMulOperations(const OpInfo& op_features,
bool* found_unknown_shapes) const;
- int64 CountConv2DBackPropInputOperations(const OpInfo& op_features,
+ int64 CountConv2DBackpropInputOperations(const OpInfo& op_features,
ConvolutionDimensions* conv_info,
bool* found_unknown_shapes) const;
- int64 CountConv2DBackPropFilterOperations(const OpInfo& op_features,
+ int64 CountConv2DBackpropFilterOperations(const OpInfo& op_features,
ConvolutionDimensions* conv_info,
bool* found_unknown_shapes) const;
@@ -124,8 +124,8 @@ class OpLevelCostEstimator {
// device.
Costs PredictConv2D(const OpInfo& op_features) const;
Costs PredictCwiseOp(const OpInfo& op_features) const;
- Costs PredictConv2DBackPropInput(const OpInfo& op_features) const;
- Costs PredictConv2DBackPropFilter(const OpInfo& op_features) const;
+ Costs PredictConv2DBackpropInput(const OpInfo& op_features) const;
+ Costs PredictConv2DBackpropFilter(const OpInfo& op_features) const;
Costs PredictMatMul(const OpInfo& op_features) const;
Costs PredictNoOp(const OpInfo& op_features) const;
Costs PredictBatchMatMul(const OpInfo& op_features) const;
diff --git a/tensorflow/core/kernels/BUILD b/tensorflow/core/kernels/BUILD
index 93e28bfb38..a08e2f5ee3 100644
--- a/tensorflow/core/kernels/BUILD
+++ b/tensorflow/core/kernels/BUILD
@@ -4429,6 +4429,7 @@ filegroup(
"depthtospace_op.h",
"depthwise_conv_op.h",
"fake_quant_ops_functor.h",
+ "fused_batch_norm_op.h",
"gemm_functors.h",
"image_resizer_state.h",
"maxpooling_op.h",
diff --git a/tensorflow/core/kernels/conv_grad_filter_ops.cc b/tensorflow/core/kernels/conv_grad_filter_ops.cc
index 8eb705b2e5..641077ca65 100644
--- a/tensorflow/core/kernels/conv_grad_filter_ops.cc
+++ b/tensorflow/core/kernels/conv_grad_filter_ops.cc
@@ -555,7 +555,7 @@ void LaunchConv2DBackpropFilterOp<Eigen::GpuDevice, T>::operator()(
int col_stride, const Padding& padding, Tensor* filter_backprop,
TensorFormat data_format) {
using perftools::gputools::dnn::AlgorithmConfig;
- using perftools::gputools::dnn::AlgorithmType;
+ using perftools::gputools::dnn::AlgorithmDesc;
using perftools::gputools::dnn::ProfileResult;
std::vector<int32> strides(4, 1);
@@ -816,35 +816,40 @@ void LaunchConv2DBackpropFilterOp<Eigen::GpuDevice, T>::operator()(
AlgorithmConfig algorithm_config;
if (cudnn_use_autotune && !AutoTuneConvBwdFilter::GetInstance()->Find(
conv_parameters, &algorithm_config)) {
- std::vector<AlgorithmType> algorithms;
+ std::vector<AlgorithmDesc::Index> algorithms;
CHECK(stream->parent()->GetConvolveBackwardFilterAlgorithms(
conv_parameters.ShouldIncludeWinogradNonfusedAlgo<T>(), &algorithms));
ProfileResult best_result;
ProfileResult best_result_no_scratch;
- for (auto profile_algorithm : algorithms) {
- // TODO(zhengxq): profile each algorithm multiple times to better
- // accuracy.
- CudnnScratchAllocator scratch_allocator(ConvolveBackwardFilterScratchSize,
- ctx);
- ProfileResult profile_result;
- bool cudnn_launch_status =
- stream
- ->ThenConvolveBackwardFilterWithAlgorithm(
- input_desc, input_ptr, output_desc, out_backprop_ptr,
- conv_desc, filter_desc, &filter_backprop_ptr,
- &scratch_allocator, AlgorithmConfig(profile_algorithm),
- &profile_result)
- .ok();
- if (cudnn_launch_status) {
- if (profile_result.is_valid()) {
- if (profile_result.elapsed_time_in_ms() <
- best_result.elapsed_time_in_ms()) {
- best_result = profile_result;
- }
- if (scratch_allocator.TotalByteSize() == 0 &&
- profile_result.elapsed_time_in_ms() <
- best_result_no_scratch.elapsed_time_in_ms()) {
- best_result_no_scratch = profile_result;
+ // TODO(benbarsdell): Ideally this should not attempt using tensor op math
+ // if it's not enabled.
+ for (bool use_tensor_ops : {false, true}) {
+ for (auto algo_index : algorithms) {
+ // TODO(zhengxq): profile each algorithm multiple times to better
+ // accuracy.
+ AlgorithmDesc profile_algorithm(algo_index, use_tensor_ops);
+ CudnnScratchAllocator scratch_allocator(
+ ConvolveBackwardFilterScratchSize, ctx);
+ ProfileResult profile_result;
+ bool cudnn_launch_status =
+ stream
+ ->ThenConvolveBackwardFilterWithAlgorithm(
+ input_desc, input_ptr, output_desc, out_backprop_ptr,
+ conv_desc, filter_desc, &filter_backprop_ptr,
+ &scratch_allocator, AlgorithmConfig(profile_algorithm),
+ &profile_result)
+ .ok();
+ if (cudnn_launch_status) {
+ if (profile_result.is_valid()) {
+ if (profile_result.elapsed_time_in_ms() <
+ best_result.elapsed_time_in_ms()) {
+ best_result = profile_result;
+ }
+ if (scratch_allocator.TotalByteSize() == 0 &&
+ profile_result.elapsed_time_in_ms() <
+ best_result_no_scratch.elapsed_time_in_ms()) {
+ best_result_no_scratch = profile_result;
+ }
}
}
}
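
The same autotuning pattern recurs in the backward-data, 3D and forward convolution kernels below: every returned algorithm index is tried with tensor-op math off and on, each attempt is profiled, and both the overall fastest candidate and the fastest scratch-free candidate are kept. A standalone sketch of that selection loop, where Candidate, Profile and RunOnce are hypothetical stand-ins rather than StreamExecutor types:

```c++
#include <cstddef>
#include <iostream>
#include <limits>
#include <vector>

struct Candidate {
  int algo_index = -1;
  bool use_tensor_ops = false;
  float elapsed_ms = std::numeric_limits<float>::infinity();
};

struct Profile {
  bool ok = false;
  float elapsed_ms = 0.0f;
  std::size_t scratch_bytes = 0;
};

// Stand-in for launching the convolution once with a given algorithm; real code
// would time a cuDNN call and report its scratch usage.
Profile RunOnce(int algo, bool use_tensor_ops) {
  Profile p;
  p.ok = true;
  p.elapsed_ms = static_cast<float>(algo) + (use_tensor_ops ? 0.25f : 0.5f);
  p.scratch_bytes = 0;
  return p;
}

void Autotune(const std::vector<int>& algorithms, Candidate* best,
              Candidate* best_no_scratch) {
  for (bool use_tensor_ops : {false, true}) {
    for (int algo : algorithms) {
      Profile p = RunOnce(algo, use_tensor_ops);
      if (!p.ok) continue;
      if (p.elapsed_ms < best->elapsed_ms) {
        best->algo_index = algo;
        best->use_tensor_ops = use_tensor_ops;
        best->elapsed_ms = p.elapsed_ms;
      }
      if (p.scratch_bytes == 0 && p.elapsed_ms < best_no_scratch->elapsed_ms) {
        best_no_scratch->algo_index = algo;
        best_no_scratch->use_tensor_ops = use_tensor_ops;
        best_no_scratch->elapsed_ms = p.elapsed_ms;
      }
    }
  }
}

int main() {
  Candidate best, best_no_scratch;
  Autotune({0, 1, 2}, &best, &best_no_scratch);
  std::cout << "best algo " << best.algo_index << " tensor_ops "
            << best.use_tensor_ops << "\n";
  return 0;
}
```

The chosen configuration is then cached against the convolution parameters, which is why the search only runs when the Find() lookup above misses.
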
diff --git a/tensorflow/core/kernels/conv_grad_input_ops.cc b/tensorflow/core/kernels/conv_grad_input_ops.cc
index ce561aa99c..0732bf4046 100644
--- a/tensorflow/core/kernels/conv_grad_input_ops.cc
+++ b/tensorflow/core/kernels/conv_grad_input_ops.cc
@@ -630,7 +630,7 @@ void LaunchConv2DBackpropInputOp<GPUDevice, T>::operator()(
int col_stride, const Padding& padding, Tensor* in_backprop,
TensorFormat data_format) {
using perftools::gputools::dnn::AlgorithmConfig;
- using perftools::gputools::dnn::AlgorithmType;
+ using perftools::gputools::dnn::AlgorithmDesc;
using perftools::gputools::dnn::ProfileResult;
std::vector<int32> strides(4, 1);
@@ -870,34 +870,39 @@ void LaunchConv2DBackpropInputOp<GPUDevice, T>::operator()(
AlgorithmConfig algorithm_config;
if (cudnn_use_autotune && !AutoTuneConvBwdData::GetInstance()->Find(
conv_parameters, &algorithm_config)) {
- std::vector<AlgorithmType> algorithms;
+ std::vector<AlgorithmDesc::Index> algorithms;
CHECK(stream->parent()->GetConvolveBackwardDataAlgorithms(
conv_parameters.ShouldIncludeWinogradNonfusedAlgo<T>(), &algorithms));
ProfileResult best_result;
ProfileResult best_result_no_scratch;
- for (auto profile_algorithm : algorithms) {
- // TODO(zhengxq): profile each algorithm multiple times to better
- // accuracy.
- CudnnScratchAllocator scratch_allocator(ConvolveBackwardDataScratchSize,
- ctx);
- ProfileResult profile_result;
- bool cudnn_launch_status =
- stream
- ->ThenConvolveBackwardDataWithAlgorithm(
- filter_desc, filter_ptr, output_desc, out_backprop_ptr,
- conv_desc, input_desc, &in_backprop_ptr, &scratch_allocator,
- AlgorithmConfig(profile_algorithm), &profile_result)
- .ok();
- if (cudnn_launch_status) {
- if (profile_result.is_valid()) {
- if (profile_result.elapsed_time_in_ms() <
- best_result.elapsed_time_in_ms()) {
- best_result = profile_result;
- }
- if (scratch_allocator.TotalByteSize() == 0 &&
- profile_result.elapsed_time_in_ms() <
- best_result_no_scratch.elapsed_time_in_ms()) {
- best_result_no_scratch = profile_result;
+ // TODO(benbarsdell): Ideally this should not attempt using tensor op math
+ // if it's not enabled.
+ for (bool use_tensor_ops : {false, true}) {
+ for (auto algo_index : algorithms) {
+ // TODO(zhengxq): profile each algorithm multiple times to better
+ // accuracy.
+ AlgorithmDesc profile_algorithm(algo_index, use_tensor_ops);
+ CudnnScratchAllocator scratch_allocator(ConvolveBackwardDataScratchSize,
+ ctx);
+ ProfileResult profile_result;
+ bool cudnn_launch_status =
+ stream
+ ->ThenConvolveBackwardDataWithAlgorithm(
+ filter_desc, filter_ptr, output_desc, out_backprop_ptr,
+ conv_desc, input_desc, &in_backprop_ptr, &scratch_allocator,
+ AlgorithmConfig(profile_algorithm), &profile_result)
+ .ok();
+ if (cudnn_launch_status) {
+ if (profile_result.is_valid()) {
+ if (profile_result.elapsed_time_in_ms() <
+ best_result.elapsed_time_in_ms()) {
+ best_result = profile_result;
+ }
+ if (scratch_allocator.TotalByteSize() == 0 &&
+ profile_result.elapsed_time_in_ms() <
+ best_result_no_scratch.elapsed_time_in_ms()) {
+ best_result_no_scratch = profile_result;
+ }
}
}
}
diff --git a/tensorflow/core/kernels/conv_grad_ops_3d.cc b/tensorflow/core/kernels/conv_grad_ops_3d.cc
index cdb4b602ad..8ad56053a8 100644
--- a/tensorflow/core/kernels/conv_grad_ops_3d.cc
+++ b/tensorflow/core/kernels/conv_grad_ops_3d.cc
@@ -649,40 +649,45 @@ class Conv3DBackpropInputOp<GPUDevice, T> : public OpKernel {
};
using perftools::gputools::dnn::AlgorithmConfig;
- using perftools::gputools::dnn::AlgorithmType;
+ using perftools::gputools::dnn::AlgorithmDesc;
using perftools::gputools::dnn::ProfileResult;
- using perftools::gputools::dnn::kDefaultAlgorithm;
AlgorithmConfig algorithm_config;
if (cudnn_use_autotune_ && !AutoTuneConv3dBwdData::GetInstance()->Find(
conv_parameters, &algorithm_config)) {
- std::vector<AlgorithmType> algorithms;
+ std::vector<AlgorithmDesc::Index> algorithms;
CHECK(stream->parent()->GetConvolveBackwardDataAlgorithms(
conv_parameters.ShouldIncludeWinogradNonfusedAlgo<T>(), &algorithms));
ProfileResult best_result;
ProfileResult best_result_no_scratch;
- for (auto profile_algorithm : algorithms) {
- // TODO(zhengxq): profile each algorithm multiple times to better
- // accuracy.
- CudnnScratchAllocator scratch_allocator(ConvolveBackwardDataScratchSize,
- context);
- ProfileResult profile_result;
- bool cudnn_launch_status =
- stream
- ->ThenConvolveBackwardDataWithAlgorithm(
- filter_desc, filter_ptr, output_desc, out_backprop_ptr,
- conv_desc, input_desc, &in_backprop_ptr, &scratch_allocator,
- AlgorithmConfig(profile_algorithm), &profile_result)
- .ok();
- if (cudnn_launch_status) {
- if (profile_result.is_valid()) {
- if (profile_result.elapsed_time_in_ms() <
- best_result.elapsed_time_in_ms()) {
- best_result = profile_result;
- }
- if (scratch_allocator.TotalByteSize() == 0 &&
- profile_result.elapsed_time_in_ms() <
- best_result_no_scratch.elapsed_time_in_ms()) {
- best_result_no_scratch = profile_result;
+ // TODO(benbarsdell): Ideally this should not attempt using tensor op math
+ // if it's not enabled.
+ for (bool use_tensor_ops : {false, true}) {
+ for (auto algo_index : algorithms) {
+ // TODO(zhengxq): profile each algorithm multiple times to better
+ // accuracy.
+ AlgorithmDesc profile_algorithm(algo_index, use_tensor_ops);
+ CudnnScratchAllocator scratch_allocator(
+ ConvolveBackwardDataScratchSize, context);
+ ProfileResult profile_result;
+ bool cudnn_launch_status =
+ stream
+ ->ThenConvolveBackwardDataWithAlgorithm(
+ filter_desc, filter_ptr, output_desc, out_backprop_ptr,
+ conv_desc, input_desc, &in_backprop_ptr,
+ &scratch_allocator, AlgorithmConfig(profile_algorithm),
+ &profile_result)
+ .ok();
+ if (cudnn_launch_status) {
+ if (profile_result.is_valid()) {
+ if (profile_result.elapsed_time_in_ms() <
+ best_result.elapsed_time_in_ms()) {
+ best_result = profile_result;
+ }
+ if (scratch_allocator.TotalByteSize() == 0 &&
+ profile_result.elapsed_time_in_ms() <
+ best_result_no_scratch.elapsed_time_in_ms()) {
+ best_result_no_scratch = profile_result;
+ }
}
}
}
@@ -1016,41 +1021,45 @@ class Conv3DBackpropFilterOp<GPUDevice, T> : public OpKernel {
};
using perftools::gputools::dnn::AlgorithmConfig;
- using perftools::gputools::dnn::AlgorithmType;
+ using perftools::gputools::dnn::AlgorithmDesc;
using perftools::gputools::dnn::ProfileResult;
- using perftools::gputools::dnn::kDefaultAlgorithm;
AlgorithmConfig algorithm_config;
if (cudnn_use_autotune_ && !AutoTuneConv3dBwdFilter::GetInstance()->Find(
conv_parameters, &algorithm_config)) {
- std::vector<AlgorithmType> algorithms;
+ std::vector<AlgorithmDesc::Index> algorithms;
CHECK(stream->parent()->GetConvolveBackwardFilterAlgorithms(
conv_parameters.ShouldIncludeWinogradNonfusedAlgo<T>(), &algorithms));
ProfileResult best_result;
ProfileResult best_result_no_scratch;
- for (auto profile_algorithm : algorithms) {
- // TODO(zhengxq): profile each algorithm multiple times to better
- // accuracy.
- CudnnScratchAllocator scratch_allocator(
- ConvolveBackwardFilterScratchSize, context);
- ProfileResult profile_result;
- bool cudnn_launch_status =
- stream
- ->ThenConvolveBackwardFilterWithAlgorithm(
- input_desc, input_ptr, output_desc, out_backprop_ptr,
- conv_desc, filter_desc, &filter_backprop_ptr,
- &scratch_allocator, AlgorithmConfig(profile_algorithm),
- &profile_result)
- .ok();
- if (cudnn_launch_status) {
- if (profile_result.is_valid()) {
- if (profile_result.elapsed_time_in_ms() <
- best_result.elapsed_time_in_ms()) {
- best_result = profile_result;
- }
- if (scratch_allocator.TotalByteSize() == 0 &&
- profile_result.elapsed_time_in_ms() <
- best_result_no_scratch.elapsed_time_in_ms()) {
- best_result_no_scratch = profile_result;
+ // TODO(benbarsdell): Ideally this should not attempt using tensor op math
+ // if it's not enabled.
+ for (bool use_tensor_ops : {false, true}) {
+ for (auto algo_index : algorithms) {
+ // TODO(zhengxq): profile each algorithm multiple times to better
+ // accuracy.
+ AlgorithmDesc profile_algorithm(algo_index, use_tensor_ops);
+ CudnnScratchAllocator scratch_allocator(
+ ConvolveBackwardFilterScratchSize, context);
+ ProfileResult profile_result;
+ bool cudnn_launch_status =
+ stream
+ ->ThenConvolveBackwardFilterWithAlgorithm(
+ input_desc, input_ptr, output_desc, out_backprop_ptr,
+ conv_desc, filter_desc, &filter_backprop_ptr,
+ &scratch_allocator, AlgorithmConfig(profile_algorithm),
+ &profile_result)
+ .ok();
+ if (cudnn_launch_status) {
+ if (profile_result.is_valid()) {
+ if (profile_result.elapsed_time_in_ms() <
+ best_result.elapsed_time_in_ms()) {
+ best_result = profile_result;
+ }
+ if (scratch_allocator.TotalByteSize() == 0 &&
+ profile_result.elapsed_time_in_ms() <
+ best_result_no_scratch.elapsed_time_in_ms()) {
+ best_result_no_scratch = profile_result;
+ }
}
}
}
diff --git a/tensorflow/core/kernels/conv_ops.cc b/tensorflow/core/kernels/conv_ops.cc
index bbb9e36fc9..dc03eeb658 100644
--- a/tensorflow/core/kernels/conv_ops.cc
+++ b/tensorflow/core/kernels/conv_ops.cc
@@ -447,9 +447,8 @@ void LaunchConv2DOp<GPUDevice, T>::operator()(
int col_stride, const Padding& padding, Tensor* output,
TensorFormat data_format) {
using perftools::gputools::dnn::AlgorithmConfig;
- using perftools::gputools::dnn::AlgorithmType;
+ using perftools::gputools::dnn::AlgorithmDesc;
using perftools::gputools::dnn::ProfileResult;
- using perftools::gputools::dnn::kDefaultAlgorithm;
auto* stream = ctx->op_device_context()->stream();
OP_REQUIRES(ctx, stream, errors::Internal("No GPU stream available."));
@@ -663,33 +662,38 @@ void LaunchConv2DOp<GPUDevice, T>::operator()(
AlgorithmConfig algorithm_config;
if (cudnn_use_autotune &&
!AutoTuneConv::GetInstance()->Find(conv_parameters, &algorithm_config)) {
- std::vector<AlgorithmType> algorithms;
+ std::vector<AlgorithmDesc::Index> algorithms;
CHECK(stream->parent()->GetConvolveAlgorithms(
conv_parameters.ShouldIncludeWinogradNonfusedAlgo<T>(), &algorithms));
ProfileResult best_result;
ProfileResult best_result_no_scratch;
- for (auto profile_algorithm : algorithms) {
- // TODO(zhengxq): profile each algorithm multiple times to better
- // accuracy.
- CudnnScratchAllocator scratch_allocator(ConvolveScratchSize, ctx);
- ProfileResult profile_result;
- bool cudnn_launch_status =
- stream
- ->ThenConvolveWithAlgorithm(
- input_desc, input_ptr, filter_desc, filter_ptr, conv_desc,
- output_desc, &output_ptr, &scratch_allocator,
- AlgorithmConfig(profile_algorithm), &profile_result)
- .ok();
- if (cudnn_launch_status) {
- if (profile_result.is_valid()) {
- if (profile_result.elapsed_time_in_ms() <
- best_result.elapsed_time_in_ms()) {
- best_result = profile_result;
- }
- if (scratch_allocator.TotalByteSize() == 0 &&
- profile_result.elapsed_time_in_ms() <
- best_result_no_scratch.elapsed_time_in_ms()) {
- best_result_no_scratch = profile_result;
+ // TODO(benbarsdell): Ideally this should not attempt using tensor op math
+ // if it's not enabled.
+ for (bool use_tensor_ops : {false, true}) {
+ for (auto algo_index : algorithms) {
+ // TODO(zhengxq): profile each algorithm multiple times to better
+ // accuracy.
+ AlgorithmDesc profile_algorithm(algo_index, use_tensor_ops);
+ CudnnScratchAllocator scratch_allocator(ConvolveScratchSize, ctx);
+ ProfileResult profile_result;
+ bool cudnn_launch_status =
+ stream
+ ->ThenConvolveWithAlgorithm(
+ input_desc, input_ptr, filter_desc, filter_ptr, conv_desc,
+ output_desc, &output_ptr, &scratch_allocator,
+ AlgorithmConfig(profile_algorithm), &profile_result)
+ .ok();
+ if (cudnn_launch_status) {
+ if (profile_result.is_valid()) {
+ if (profile_result.elapsed_time_in_ms() <
+ best_result.elapsed_time_in_ms()) {
+ best_result = profile_result;
+ }
+ if (scratch_allocator.TotalByteSize() == 0 &&
+ profile_result.elapsed_time_in_ms() <
+ best_result_no_scratch.elapsed_time_in_ms()) {
+ best_result_no_scratch = profile_result;
+ }
}
}
}
diff --git a/tensorflow/core/kernels/conv_ops_3d.cc b/tensorflow/core/kernels/conv_ops_3d.cc
index 5dea336b28..72758f707a 100644
--- a/tensorflow/core/kernels/conv_ops_3d.cc
+++ b/tensorflow/core/kernels/conv_ops_3d.cc
@@ -383,41 +383,45 @@ struct LaunchConvOp<GPUDevice, T> {
};
using perftools::gputools::dnn::AlgorithmConfig;
- using perftools::gputools::dnn::AlgorithmType;
+ using perftools::gputools::dnn::AlgorithmDesc;
using perftools::gputools::dnn::ProfileResult;
- using perftools::gputools::dnn::kDefaultAlgorithm;
AlgorithmConfig algorithm_config;
if (cudnn_use_autotune && !AutoTuneConv3d::GetInstance()->Find(
conv_parameters, &algorithm_config)) {
- std::vector<AlgorithmType> algorithms;
+ std::vector<AlgorithmDesc::Index> algorithms;
CHECK(stream->parent()->GetConvolveAlgorithms(
conv_parameters.ShouldIncludeWinogradNonfusedAlgo<T>(), &algorithms));
ProfileResult best_result;
ProfileResult best_result_no_scratch;
- for (auto profile_algorithm : algorithms) {
- // TODO(zhengxq): profile each algorithm multiple times to better
- // accuracy.
- CudnnScratchAllocator scratch_allocator(ConvolveScratchSize, ctx);
- ProfileResult profile_result;
- bool cudnn_launch_status =
- stream
- ->ThenConvolveWithAlgorithm(
- input_desc, input_ptr, filter_desc, filter_ptr, conv_desc,
- output_desc, &output_ptr, &scratch_allocator,
- AlgorithmConfig(profile_algorithm), &profile_result)
- .ok();
- if (cudnn_launch_status) {
- if (profile_result.is_valid()) {
- if (profile_result.elapsed_time_in_ms() <
- best_result.elapsed_time_in_ms()) {
- best_result = profile_result;
- }
- if (scratch_allocator.TotalByteSize() == 0 &&
- profile_result.elapsed_time_in_ms() <
- best_result_no_scratch.elapsed_time_in_ms()) {
- best_result_no_scratch = profile_result;
+ // TODO(benbarsdell): Ideally this should not attempt using tensor op math
+ // if it's not enabled.
+ for (bool use_tensor_ops : {false, true}) {
+ for (auto algo_index : algorithms) {
+ AlgorithmDesc profile_algorithm(algo_index, use_tensor_ops);
+ // TODO(zhengxq): profile each algorithm multiple times to better
+ // accuracy.
+ CudnnScratchAllocator scratch_allocator(ConvolveScratchSize, ctx);
+ ProfileResult profile_result;
+ bool cudnn_launch_status =
+ stream
+ ->ThenConvolveWithAlgorithm(
+ input_desc, input_ptr, filter_desc, filter_ptr, conv_desc,
+ output_desc, &output_ptr, &scratch_allocator,
+ AlgorithmConfig(profile_algorithm), &profile_result)
+ .ok();
+ if (cudnn_launch_status) {
+ if (profile_result.is_valid()) {
+ if (profile_result.elapsed_time_in_ms() <
+ best_result.elapsed_time_in_ms()) {
+ best_result = profile_result;
+ }
+ if (scratch_allocator.TotalByteSize() == 0 &&
+ profile_result.elapsed_time_in_ms() <
+ best_result_no_scratch.elapsed_time_in_ms()) {
+ best_result_no_scratch = profile_result;
+ }
}
}
}
diff --git a/tensorflow/core/kernels/eigen_attention.h b/tensorflow/core/kernels/eigen_attention.h
index f4c42372b1..887b9b7221 100644
--- a/tensorflow/core/kernels/eigen_attention.h
+++ b/tensorflow/core/kernels/eigen_attention.h
@@ -81,21 +81,26 @@ struct GlimpseExtractionOp {
for (Index i = 0; i < batch_size; ++i) {
float x = offsets_[i].first, y = offsets_[i].second;
- // Un-normalize coordinates back to pixel space if normalized.
if (normalized_) {
+ // Un-normalize coordinates back to pixel space if normalized.
x *= input_width;
y *= input_height;
+ if (centered_) {
+ // Un-center if coordinates are centered on the image center.
+ x /= 2.0f;
+ y /= 2.0f;
+ x += input_width / 2.0f;
+ y += input_height / 2.0f;
+ // Remove half of the glimpse window.
+ x -= width_ / 2.0f;
+ y -= height_ / 2.0f;
+ }
+ } else {
+ if (centered_) {
+ x += input_width / 2.0f;
+ y += input_height / 2.0f;
+ }
}
- // Un-center if coordinates are centered on the image center.
- if (centered_) {
- x /= 2.0f;
- y /= 2.0f;
- x += input_width / 2.0f;
- y += input_height / 2.0f;
- }
- // Remove half of the glimpse window.
- x -= width_ / 2.0f;
- y -= height_ / 2.0f;
const Index offset_x = (Index) x;
const Index offset_y = (Index) y;
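
After this restructuring the offset handling depends on both flags: normalized, centered coordinates are scaled to pixels, un-centered, and shifted to the top-left corner of the glimpse window, while non-normalized centered coordinates are only shifted by the image center and non-normalized, non-centered ones are used as-is. A standalone sketch of that mapping (plain floats; not the Eigen kernel itself):

```c++
#include <utility>

// Mirrors the branch structure in the hunk above; width/height are the glimpse
// size, input_width/input_height the image size.
std::pair<float, float> MapGlimpseOffset(float x, float y, bool normalized,
                                         bool centered, float input_width,
                                         float input_height, float width,
                                         float height) {
  if (normalized) {
    x *= input_width;
    y *= input_height;
    if (centered) {
      // Un-center, then move to the top-left corner of the glimpse window.
      x = x / 2.0f + input_width / 2.0f - width / 2.0f;
      y = y / 2.0f + input_height / 2.0f - height / 2.0f;
    }
  } else if (centered) {
    x += input_width / 2.0f;
    y += input_height / 2.0f;
  }
  return {x, y};
}
```

For example, a normalized, centered offset of (0, 0) on a 100x100 input with a 20x20 glimpse maps to (40, 40), i.e. the glimpse ends up centered on the image.
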
diff --git a/tensorflow/core/kernels/fused_batch_norm_op.cc b/tensorflow/core/kernels/fused_batch_norm_op.cc
index cc303e8dba..92b093eec6 100644
--- a/tensorflow/core/kernels/fused_batch_norm_op.cc
+++ b/tensorflow/core/kernels/fused_batch_norm_op.cc
@@ -17,7 +17,6 @@ limitations under the License.
#if GOOGLE_CUDA
#define EIGEN_USE_GPU
-#include "tensorflow/core/kernels/fused_batch_norm_op.h"
#include "tensorflow/core/kernels/conv_2d.h"
#include "tensorflow/core/kernels/conv_ops_gpu.h"
#include "tensorflow/core/util/stream_executor_util.h"
@@ -28,6 +27,7 @@ limitations under the License.
#include "tensorflow/core/framework/register_types.h"
#include "tensorflow/core/framework/tensor.h"
#include "tensorflow/core/framework/tensor_types.h"
+#include "tensorflow/core/kernels/fused_batch_norm_op.h"
#include "tensorflow/core/util/tensor_format.h"
namespace tensorflow {
@@ -39,7 +39,8 @@ namespace functor {
// Functor used by FusedBatchNormOp to do the computations.
template <typename Device, typename T>
struct FusedBatchNorm;
-// Functor used by FusedBatchNormGradOp to do the computations.
+// Functor used by FusedBatchNormGradOp to do the computations when
+// is_training=True.
template <typename Device, typename T>
struct FusedBatchNormGrad;
@@ -352,7 +353,7 @@ template <typename T>
struct FusedBatchNormGrad<GPUDevice, T> {
void operator()(OpKernelContext* context, const Tensor& y_backprop,
const Tensor& x, const Tensor& scale, const Tensor& mean,
- const Tensor& variance, T epsilon, Tensor* x_backprop,
+ const Tensor& inv_variance, T epsilon, Tensor* x_backprop,
Tensor* scale_backprop, Tensor* offset_backprop,
TensorFormat tensor_format) {
auto* stream = context->op_device_context()->stream();
@@ -441,16 +442,18 @@ struct FusedBatchNormGrad<GPUDevice, T> {
auto x_ptr = StreamExecutorUtil::AsDeviceMemory<T>(x_maybe_transformed);
auto scale_ptr = StreamExecutorUtil::AsDeviceMemory<T>(scale);
auto mean_ptr = StreamExecutorUtil::AsDeviceMemory<T>(mean);
- auto variance_ptr = StreamExecutorUtil::AsDeviceMemory<T>(variance);
+ auto inv_variance_ptr = StreamExecutorUtil::AsDeviceMemory<T>(inv_variance);
auto scale_backprop_ptr =
StreamExecutorUtil::AsDeviceMemory<T>(*scale_backprop);
auto offset_backprop_ptr =
StreamExecutorUtil::AsDeviceMemory<T>(*offset_backprop);
+ // the cudnn kernel outputs inverse variance in forward and reuse it in
+ // backward
bool cudnn_launch_status =
stream
->ThenBatchNormalizationBackward(
- y_backprop_ptr, x_ptr, scale_ptr, mean_ptr, variance_ptr,
+ y_backprop_ptr, x_ptr, scale_ptr, mean_ptr, inv_variance_ptr,
x_desc, scale_offset_desc, static_cast<double>(epsilon),
&x_backprop_ptr, &scale_backprop_ptr, &offset_backprop_ptr)
.ok();
@@ -468,6 +471,20 @@ struct FusedBatchNormGrad<GPUDevice, T> {
}
}
};
+
+// Forward declarations of the functor specializations for GPU.
+#define DECLARE_GPU_SPEC(T) \
+ template <> \
+ void FusedBatchNormFreezeGrad<GPUDevice, T>::operator()( \
+ const GPUDevice& d, const Tensor& y_backprop_input, \
+ const Tensor& x_input, const Tensor& scale_input, \
+ const Tensor& mean_input, const Tensor& variance_input, T epsilon, \
+ Tensor* x_backprop_output, Tensor* scale_backprop_output, \
+ Tensor* offset_backprop_output, typename TTypes<T>::Vec scratch1, \
+ typename TTypes<T>::Vec scratch2); \
+ extern template struct FusedBatchNormFreezeGrad<GPUDevice, T>;
+DECLARE_GPU_SPEC(float);
+
#endif // GOOGLE_CUDA
} // namespace functor
@@ -511,7 +528,7 @@ class FusedBatchNormOp : public OpKernel {
if (is_training_) {
OP_REQUIRES(
context, estimated_mean.dim_size(0) == 0,
- errors::InvalidArgument("estimated_mean empty for training",
+ errors::InvalidArgument("estimated_mean must be empty for training",
estimated_mean.shape().DebugString()));
OP_REQUIRES(context, estimated_variance.dim_size(0) == 0,
errors::InvalidArgument(
@@ -531,14 +548,14 @@ class FusedBatchNormOp : public OpKernel {
Tensor* saved_mean = nullptr;
OP_REQUIRES_OK(context,
context->allocate_output(3, scale.shape(), &saved_mean));
- Tensor* saved_inv_var = nullptr;
- OP_REQUIRES_OK(context,
- context->allocate_output(4, scale.shape(), &saved_inv_var));
+ Tensor* saved_maybe_inv_var = nullptr;
+ OP_REQUIRES_OK(context, context->allocate_output(4, scale.shape(),
+ &saved_maybe_inv_var));
functor::FusedBatchNorm<Device, T>()(
context, x, scale, offset, estimated_mean, estimated_variance, epsilon_,
- y, batch_mean, batch_var, saved_mean, saved_inv_var, tensor_format_,
- is_training_);
+ y, batch_mean, batch_var, saved_mean, saved_maybe_inv_var,
+ tensor_format_, is_training_);
}
private:
@@ -559,16 +576,21 @@ class FusedBatchNormGradOp : public OpKernel {
OP_REQUIRES_OK(context, context->GetAttr("data_format", &tensor_format));
OP_REQUIRES(context, FormatFromString(tensor_format, &tensor_format_),
errors::InvalidArgument("Invalid data format"));
+ OP_REQUIRES_OK(context, context->GetAttr("is_training", &is_training_));
}
void Compute(OpKernelContext* context) override {
const Tensor& y_backprop = context->input(0);
const Tensor& x = context->input(1);
const Tensor& scale = context->input(2);
- const Tensor& saved_mean = context->input(3);
- // The Eigen implementation saves variance in the forward pass, while cuDNN
+ // When is_training=True, batch mean and variance/inverted variance are
+ // saved in the forward pass to be reused here. When is_training=False,
+ // population mean and variance need to be forwarded here to compute the
+ // gradients.
+ const Tensor& saved_mean_or_pop_mean = context->input(3);
+ // The Eigen implementation saves variance in the forward pass, while cuDNN
// saves inverted variance.
- const Tensor& saved_maybe_inv_var = context->input(4);
+ const Tensor& saved_maybe_inv_var_or_pop_var = context->input(4);
OP_REQUIRES(context, y_backprop.dims() == 4,
errors::InvalidArgument("input must be 4-dimensional",
@@ -579,13 +601,14 @@ class FusedBatchNormGradOp : public OpKernel {
OP_REQUIRES(context, scale.dims() == 1,
errors::InvalidArgument("scale must be 1-dimensional",
scale.shape().DebugString()));
- OP_REQUIRES(context, saved_mean.dims() == 1,
- errors::InvalidArgument("saved mean must be 1-dimensional",
- saved_mean.shape().DebugString()));
OP_REQUIRES(
- context, saved_maybe_inv_var.dims() == 1,
- errors::InvalidArgument("saved variance must be 1-dimensional",
- saved_maybe_inv_var.shape().DebugString()));
+ context, saved_mean_or_pop_mean.dims() == 1,
+ errors::InvalidArgument("saved mean must be 1-dimensional",
+ saved_mean_or_pop_mean.shape().DebugString()));
+ OP_REQUIRES(context, saved_maybe_inv_var_or_pop_var.dims() == 1,
+ errors::InvalidArgument(
+ "saved variance must be 1-dimensional",
+ saved_maybe_inv_var_or_pop_var.shape().DebugString()));
Tensor* x_backprop = nullptr;
OP_REQUIRES_OK(context,
@@ -607,14 +630,37 @@ class FusedBatchNormGradOp : public OpKernel {
OP_REQUIRES_OK(
context, context->allocate_output(4, TensorShape({}), &placeholder_2));
- functor::FusedBatchNormGrad<Device, T>()(
- context, y_backprop, x, scale, saved_mean, saved_maybe_inv_var,
- epsilon_, x_backprop, scale_backprop, offset_backprop, tensor_format_);
+ if (is_training_) {
+ functor::FusedBatchNormGrad<Device, T>()(
+ context, y_backprop, x, scale, saved_mean_or_pop_mean,
+ saved_maybe_inv_var_or_pop_var, epsilon_, x_backprop, scale_backprop,
+ offset_backprop, tensor_format_);
+
+ } else {
+ // Necessary layout conversion is currently done in python.
+ CHECK(tensor_format_ == FORMAT_NHWC)
+ << "The implementation of FusedBatchNormGrad with is_training=False "
+ "only support "
+ << "NHWC tensor format for now.";
+ Tensor scratch1, scratch2;
+ OP_REQUIRES_OK(context,
+ context->allocate_temp(DataTypeToEnum<T>::value,
+ scale_offset_shape, &scratch1));
+ OP_REQUIRES_OK(context,
+ context->allocate_temp(DataTypeToEnum<T>::value,
+ scale_offset_shape, &scratch2));
+ functor::FusedBatchNormFreezeGrad<Device, T>()(
+ context->eigen_device<Device>(), y_backprop, x, scale,
+ saved_mean_or_pop_mean, saved_maybe_inv_var_or_pop_var, epsilon_,
+ x_backprop, scale_backprop, offset_backprop, scratch1.vec<T>(),
+ scratch2.vec<T>());
+ }
}
private:
T epsilon_;
TensorFormat tensor_format_;
+ bool is_training_;
};
REGISTER_KERNEL_BUILDER(Name("FusedBatchNorm").Device(DEVICE_CPU),
diff --git a/tensorflow/core/kernels/fused_batch_norm_op.cu.cc b/tensorflow/core/kernels/fused_batch_norm_op.cu.cc
index 5146ca626a..6157aae2aa 100644
--- a/tensorflow/core/kernels/fused_batch_norm_op.cu.cc
+++ b/tensorflow/core/kernels/fused_batch_norm_op.cu.cc
@@ -22,6 +22,8 @@ limitations under the License.
namespace tensorflow {
namespace functor {
+template struct FusedBatchNormFreezeGrad<Eigen::GpuDevice, float>;
+
template <class T>
__global__ void VarianceToInvVarianceKernel(int nthreads, const T* input,
double epsilon, T* output) {
diff --git a/tensorflow/core/kernels/fused_batch_norm_op.h b/tensorflow/core/kernels/fused_batch_norm_op.h
index da8692caad..1566cfa4dc 100644
--- a/tensorflow/core/kernels/fused_batch_norm_op.h
+++ b/tensorflow/core/kernels/fused_batch_norm_op.h
@@ -17,9 +17,14 @@ limitations under the License.
#define TENSORFLOW_KERNELS_FUSED_BATCH_NORM_OP_H_
#include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
+#include "tensorflow/core/framework/tensor.h"
+#include "tensorflow/core/framework/tensor_types.h"
namespace tensorflow {
namespace functor {
+
+#if GOOGLE_CUDA
+
// There is a behavior difference between cuDNN v4 and v5 with regard to the
// scaling factor for function cudnnBatchNormalizationForwardInference.
// This function corrects the scaling factor if cuDNN v4 is used, so that
@@ -43,6 +48,72 @@ struct InvVarianceToVariance {
void operator()(const Eigen::GpuDevice& d, double epsilon, int sample_size,
int channels, T* variance);
};
+
+#endif // GOOGLE_CUDA
+
+// Functor used by FusedBatchNormGradOp to do the computations when
+// is_training=False. Both CPU and GPU will use this functor.
+template <typename Device, typename T>
+struct FusedBatchNormFreezeGrad {
+ void operator()(const Device& d, const Tensor& y_backprop_input,
+ const Tensor& x_input, const Tensor& scale_input,
+ const Tensor& pop_mean_input,
+ const Tensor& pop_variance_input, T epsilon,
+ Tensor* x_backprop_output, Tensor* scale_backprop_output,
+ Tensor* offset_backprop_output,
+ typename TTypes<T>::Vec scratch1,
+ typename TTypes<T>::Vec scratch2) {
+ typename TTypes<T, 4>::ConstTensor y_backprop(
+ y_backprop_input.tensor<T, 4>());
+ typename TTypes<T, 4>::ConstTensor input(x_input.tensor<T, 4>());
+ typename TTypes<T>::ConstVec scale(scale_input.vec<T>());
+ typename TTypes<T>::ConstVec pop_mean(pop_mean_input.vec<T>());
+ typename TTypes<T>::ConstVec pop_var(pop_variance_input.vec<T>());
+ typename TTypes<T, 4>::Tensor x_backprop(x_backprop_output->tensor<T, 4>());
+ typename TTypes<T>::Vec scale_backprop(scale_backprop_output->vec<T>());
+ typename TTypes<T>::Vec offset_backprop(offset_backprop_output->vec<T>());
+
+ const int depth = pop_mean.dimension(0);
+ const int rest_size = input.size() / depth;
+
+ Eigen::DSizes<Eigen::Index, 2> rest_by_depth(rest_size, depth);
+#if !defined(EIGEN_HAS_INDEX_LIST)
+ Eigen::DSizes<Eigen::Index, 2> one_by_depth(1, depth);
+ Eigen::array<int, 1> reduction_axis{0};
+ Eigen::array<int, 2> rest_by_one({rest_size, 1});
+#else
+ Eigen::IndexList<Eigen::type2index<1>, Eigen::Index> one_by_depth;
+ one_by_depth.set(1, depth);
+ Eigen::IndexList<Eigen::type2index<0> > reduction_axis;
+ Eigen::IndexList<Eigen::Index, Eigen::type2index<1> > rest_by_one;
+ rest_by_one.set(0, rest_size);
+#endif
+
+ // offset_backprop = sum(y_backprop)
+ // scale_backprop = y_backprop * ((x - pop_mean) * rsqrt(pop_var + epsilon))
+ // x_backprop = y_backprop * (scale * rsqrt(pop_var + epsilon))
+ offset_backprop.device(d) =
+ y_backprop.reshape(rest_by_depth).sum(reduction_axis);
+
+ // scratch1 = rsqrt(pop_var + epsilon)
+ scratch1.device(d) = (pop_var + pop_var.constant(epsilon)).rsqrt();
+
+ // scratch2 = sum(y_backprop * (x - mean))
+ scratch2.device(d) =
+ (y_backprop.reshape(rest_by_depth) *
+ (input.reshape(rest_by_depth) -
+ pop_mean.reshape(one_by_depth).broadcast(rest_by_one)))
+ .sum(reduction_axis);
+
+ x_backprop.reshape(rest_by_depth).device(d) =
+ y_backprop.reshape(rest_by_depth) * ((scratch1 * scale)
+ .eval()
+ .reshape(one_by_depth)
+ .broadcast(rest_by_one));
+ scale_backprop.device(d) = scratch2 * scratch1;
+ }
+};
+
} // namespace functor
} // namespace tensorflow
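
Because the population statistics are constants when is_training=False, the gradients computed by FusedBatchNormFreezeGrad reduce to simple per-channel expressions (writing $\gamma$ for scale, $\beta$ for offset, and $\mu$, $\sigma^2$ for the population mean and variance):

$$
\frac{\partial L}{\partial \beta} = \sum_i dy_i, \qquad
\frac{\partial L}{\partial \gamma} = \sum_i dy_i \, \frac{x_i - \mu}{\sqrt{\sigma^2 + \varepsilon}}, \qquad
\frac{\partial L}{\partial x_i} = dy_i \, \frac{\gamma}{\sqrt{\sigma^2 + \varepsilon}},
$$

where the sums run over every element sharing a channel. In the functor above, scratch1 holds $1/\sqrt{\sigma^2 + \varepsilon}$ and scratch2 holds $\sum_i dy_i (x_i - \mu)$, so scale_backprop is simply their product.
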
diff --git a/tensorflow/core/kernels/mkl_aggregate_ops.cc b/tensorflow/core/kernels/mkl_aggregate_ops.cc
index 51ba127def..935eb81dd0 100644
--- a/tensorflow/core/kernels/mkl_aggregate_ops.cc
+++ b/tensorflow/core/kernels/mkl_aggregate_ops.cc
@@ -54,17 +54,62 @@ class MklAddNOp : public OpKernel {
GetMklShape(ctx, 1, &(mkl_context.input2_shape));
bool input2_in_mkl_format = mkl_context.input2_shape.IsMklTensor();
+ // handle the case of a scalar
+ if (!input1_in_mkl_format && input0.dims() == 0) {
+ const TensorShape& o_shape = input0.shape();
+ Tensor* out_tensor = nullptr;
+ mkl_context.output_shape.SetMklTensor(false);
+ AllocateOutputSetMklShape(ctx, 0, &out_tensor, o_shape,
+ mkl_context.output_shape);
+ float user_i1 = (input0.scalar<T>()());
+ ;
+ float user_i2 = (input1.scalar<T>()());
+ ;
+ out_tensor->scalar<T>()() = std::plus<float>{}(user_i1, user_i2);
+ return;
+ }
+
mkl_context.in_dims = input1_in_mkl_format
? mkl_context.input1_shape.GetDimension()
: input0.dims();
mkl_context.in_dims = input2_in_mkl_format
? mkl_context.input2_shape.GetDimension()
: input1.dims();
+
+ // If there is nothing to compute, return.
+ if (!input1_in_mkl_format && !input2_in_mkl_format) {
+ const TensorShape& o_shape = input0.shape();
+ if (o_shape.num_elements() == 0) {
+ Tensor* out_tensor = nullptr;
+ mkl_context.output_shape.SetMklTensor(false);
+ AllocateOutputSetMklShape(ctx, 0, &out_tensor, o_shape,
+ mkl_context.output_shape);
+ return;
+ }
+ }
+
+ mkl_context.in_sizes = new size_t[mkl_context.in_dims];
+ mkl_context.in_strides = new size_t[mkl_context.in_dims];
// Generate size, stride for input if input is in MKL format.
- ExtractMklOpParams(&mkl_context.in1_sizes,
- &mkl_context.in1_strides, input0, &mkl_context.input1_shape);
- ExtractMklOpParams(&mkl_context.in2_sizes,
- &mkl_context.in2_strides, input1, &mkl_context.input2_shape);
+ if (input1_in_mkl_format || input2_in_mkl_format) {
+ const MklShape* tmp_mkl_shape = (input1_in_mkl_format)
+ ? &mkl_context.input1_shape
+ : &mkl_context.input2_shape;
+ for (int i = 0; i < mkl_context.in_dims; i++) {
+ mkl_context.in_sizes[i] = tmp_mkl_shape->GetSizes()[i];
+ mkl_context.in_strides[i] = tmp_mkl_shape->GetStrides()[i];
+ }
+ } else {
+ for (int i = 0; i < mkl_context.in_dims; i++) {
+ mkl_context.in_sizes[i] =
+ input0.dim_size((mkl_context.in_dims - 1) - i);
+ }
+ mkl_context.in_strides[0] = 1;
+ for (int i = 1; i < mkl_context.in_dims; i++) {
+ mkl_context.in_strides[i] =
+ mkl_context.in_strides[i - 1] * mkl_context.in_sizes[i - 1];
+ }
+ }
std::vector<float> coeff(2, 1.0);
mkl_context.MklCreateInputLayouts(ctx);
@@ -82,7 +127,7 @@ class MklAddNOp : public OpKernel {
mkl_context.output_shape.SetMklLayout(mkl_context.Eltwise, dnnResourceDst);
mkl_context.output_shape.SetTfLayout(
- mkl_context.in_dims, mkl_context.in1_sizes, mkl_context.in1_strides);
+ mkl_context.in_dims, mkl_context.in_sizes, mkl_context.in_strides);
if (input1_in_mkl_format == true) {
mkl_context.output_shape.SetTfDimOrder(mkl_context.in_dims,
mkl_context.input1_shape.GetTfToMklDimMap());
@@ -113,44 +158,11 @@ class MklAddNOp : public OpKernel {
mkl_context.MklCleanup();
}
- void ExtractMklOpParams(size_t** out_sizes, size_t** out_strides,
- const Tensor& input, const MklShape* input_shape) {
- bool input_in_mkl_format = input_shape->IsMklTensor();
- int in_dims = input_in_mkl_format
- ? input_shape->GetDimension()
- : input.dims();
- size_t* in_sizes = new size_t[in_dims];
- size_t* in_strides = new size_t[in_dims];
-
- if (input_in_mkl_format) {
- for (int i = 0; i < in_dims; i++) {
- in_sizes[i] = input_shape->GetSizes()[i];
- in_strides[i] = input_shape->GetStrides()[i];
- }
- } else {
- for (int i = 0; i < in_dims; i++) {
- in_sizes[i] =
- input.dim_size((in_dims - 1) - i);
- }
- in_strides[0] = 1;
- for (int i = 1; i < in_dims; i++) {
- in_strides[i] =
- in_strides[i - 1] * in_sizes[i - 1];
- }
- }
- *out_sizes = in_sizes;
- *out_strides = in_strides;
- }
-
-
private:
typedef struct {
int in_dims;
- size_t* in1_sizes;
- size_t* in1_strides;
-
- size_t* in2_sizes;
- size_t* in2_strides;
+ size_t* in_sizes = nullptr;
+ size_t* in_strides = nullptr;
dnnPrimitive_t Eltwise = nullptr;
dnnPrimitiveAttributes_t attributes = nullptr;
void* Eltwise_res[dnnResourceNumber];
@@ -160,18 +172,16 @@ class MklAddNOp : public OpKernel {
void MklCreateInputLayouts(OpKernelContext* context) {
bool input1_in_mkl_format = input1_shape.IsMklTensor();
if (!input1_in_mkl_format) {
- CHECK_EQ(
- dnnLayoutCreate_F32(&lt_input1, in_dims, in1_sizes, in1_strides),
- E_SUCCESS);
+ CHECK_EQ(dnnLayoutCreate_F32(&lt_input1, in_dims, in_sizes, in_strides),
+ E_SUCCESS);
} else {
lt_input1 = static_cast<dnnLayout_t>(input1_shape.GetCurLayout());
}
bool input2_in_mkl_format = input2_shape.IsMklTensor();
if (!input2_in_mkl_format) {
- CHECK_EQ(
- dnnLayoutCreate_F32(&lt_input2, in_dims, in2_sizes, in2_strides),
- E_SUCCESS);
+ CHECK_EQ(dnnLayoutCreate_F32(&lt_input2, in_dims, in_sizes, in_strides),
+ E_SUCCESS);
} else {
lt_input2 = static_cast<dnnLayout_t>(input2_shape.GetCurLayout());
}
@@ -246,15 +256,15 @@ class MklAddNOp : public OpKernel {
bool input1_in_mkl_format = input1_shape.IsMklTensor();
bool input2_in_mkl_format = input2_shape.IsMklTensor();
dnnDelete_F32(Eltwise);
+ if (!input1_in_mkl_format || !input2_in_mkl_format) {
+ delete[] in_sizes;
+ delete[] in_strides;
+ }
if (!input1_in_mkl_format) {
dnnLayoutDelete_F32(lt_input1);
- delete [] in1_sizes;
- delete [] in1_strides;
}
if (!input2_in_mkl_format) {
dnnLayoutDelete_F32(lt_input2);
- delete [] in2_sizes;
- delete [] in2_strides;
}
}
} MklAddNOpContext;
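
When neither input carries an MKL layout, the shared in_sizes/in_strides arrays above are filled from the TensorFlow shape with the dimension order reversed and the strides accumulated as a running product. A small sketch of just that computation (std::vector instead of the raw arrays the op uses):

```c++
#include <cstddef>
#include <vector>

void TfShapeToMklSizesStrides(const std::vector<std::size_t>& tf_dims,
                              std::vector<std::size_t>* sizes,
                              std::vector<std::size_t>* strides) {
  const int in_dims = static_cast<int>(tf_dims.size());
  sizes->assign(in_dims, 0);
  strides->assign(in_dims, 0);
  if (in_dims == 0) return;
  for (int i = 0; i < in_dims; ++i) {
    (*sizes)[i] = tf_dims[(in_dims - 1) - i];  // MKL order: reverse of TF order
  }
  (*strides)[0] = 1;
  for (int i = 1; i < in_dims; ++i) {
    (*strides)[i] = (*strides)[i - 1] * (*sizes)[i - 1];
  }
}
```

A TF shape of [2, 3, 4], for instance, yields sizes {4, 3, 2} and strides {1, 4, 12}.
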
diff --git a/tensorflow/core/kernels/pooling_ops_3d_sycl.h b/tensorflow/core/kernels/pooling_ops_3d_sycl.h
index d8cbc589a1..c1bc5af498 100644
--- a/tensorflow/core/kernels/pooling_ops_3d_sycl.h
+++ b/tensorflow/core/kernels/pooling_ops_3d_sycl.h
@@ -213,7 +213,7 @@ struct LaunchPoolingOp<SYCLDevice, T, MAX> {
}
};
// MaxPool3DGrad SYCL kernel. Expects the number of threads to be equal to the
-// number of elements in the output backprop tenor (i.e. the number of elements
+// number of elements in the output backprop tensor (i.e. the number of elements
// in the input data tensor).
//
// For each output backprop element we compute the possible window of values in
diff --git a/tensorflow/core/lib/jpeg/jpeg_mem.cc b/tensorflow/core/lib/jpeg/jpeg_mem.cc
index 3c7e5ca696..50ed8bdb3b 100644
--- a/tensorflow/core/lib/jpeg/jpeg_mem.cc
+++ b/tensorflow/core/lib/jpeg/jpeg_mem.cc
@@ -173,7 +173,7 @@ uint8* UncompressLow(const void* srcdata, FewerArgsForCompiler* argball) {
JDIMENSION target_output_width = cinfo.output_width;
JDIMENSION target_output_height = cinfo.output_height;
JDIMENSION skipped_scanlines = 0;
-#if !defined(WIN32)
+#if defined(LIBJPEG_TURBO_VERSION)
if (flags.crop) {
// Update target output height and width based on crop window.
target_output_height = flags.crop_height;
@@ -219,7 +219,7 @@ uint8* UncompressLow(const void* srcdata, FewerArgsForCompiler* argball) {
argball->height_ = target_output_height;
argball->stride_ = stride;
-#if defined(WIN32)
+#if !defined(LIBJPEG_TURBO_VERSION)
uint8* dstdata = nullptr;
if (flags.crop) {
dstdata = new JSAMPLE[stride * target_output_height];
@@ -336,7 +336,7 @@ uint8* UncompressLow(const void* srcdata, FewerArgsForCompiler* argball) {
delete[] tempdata;
tempdata = nullptr;
-#if !defined(WIN32)
+#if defined(LIBJPEG_TURBO_VERSION)
if (flags.crop && cinfo.output_scanline < cinfo.output_height) {
// Skip the rest of scanlines, required by jpeg_destroy_decompress.
jpeg_skip_scanlines(&cinfo,
@@ -418,7 +418,7 @@ uint8* UncompressLow(const void* srcdata, FewerArgsForCompiler* argball) {
break;
}
-#if defined(WIN32)
+#if !defined(LIBJPEG_TURBO_VERSION)
// TODO(tanmingxing): delete all these code after migrating to libjpeg_turbo
// for Windows.
if (flags.crop) {
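
The guard now keys on the JPEG library rather than the platform: the crop-window fast path needs libjpeg-turbo's partial-decode entry points, while plain libjpeg (still used on Windows, per the TODO) falls back to decoding full scanlines and cropping in memory. A minimal sketch of the feature test, assuming LIBJPEG_TURBO_VERSION is exposed through the library's jconfig.h:

```c++
#include <cstdio>     // jpeglib.h expects FILE to be declared first
#include "jpeglib.h"  // pulls in jconfig.h, which defines LIBJPEG_TURBO_VERSION
                      // when the library is libjpeg-turbo

bool SupportsPartialDecode() {
#if defined(LIBJPEG_TURBO_VERSION)
  return true;   // jpeg_crop_scanline / jpeg_skip_scanlines are available
#else
  return false;  // plain libjpeg: decode everything, crop afterwards
#endif
}
```
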
diff --git a/tensorflow/core/ops/array_ops.cc b/tensorflow/core/ops/array_ops.cc
index 62c86c7714..5dab451fce 100644
--- a/tensorflow/core/ops/array_ops.cc
+++ b/tensorflow/core/ops/array_ops.cc
@@ -635,15 +635,7 @@ REGISTER_OP("ImmutableConst")
.Attr("shape: shape")
.Attr("memory_region_name: string")
.Output("tensor: dtype")
- .SetShapeFn([](InferenceContext* c) {
- TensorShape shape_from_attr;
- TF_RETURN_IF_ERROR(c->GetAttr("shape", &shape_from_attr));
- ShapeHandle output_shape;
- TF_RETURN_IF_ERROR(
- c->MakeShapeFromPartialTensorShape(shape_from_attr, &output_shape));
- c->set_output(0, output_shape);
- return Status::OK();
- })
+ .SetShapeFn(shape_inference::ExplicitShape)
.Doc(R"doc(
Returns immutable tensor from memory region.
@@ -1307,15 +1299,7 @@ REGISTER_OP("_ParallelConcatStart")
.Attr("shape: shape")
.Attr("dtype: type")
.SetIsStateful()
- .SetShapeFn([](InferenceContext* c) {
- PartialTensorShape shape;
- TF_RETURN_IF_ERROR(c->GetAttr("shape", &shape));
- ShapeHandle output_shape;
- TF_RETURN_IF_ERROR(
- c->MakeShapeFromPartialTensorShape(shape, &output_shape));
- c->set_output(0, output_shape);
- return Status::OK();
- })
+ .SetShapeFn(shape_inference::ExplicitShape)
.Doc(R"doc(
Creates an empty Tensor with shape `shape` and type `dtype`.
@@ -3083,14 +3067,7 @@ REGISTER_OP("PlaceholderV2")
.Output("output: dtype")
.Attr("dtype: type")
.Attr("shape: shape")
- .SetShapeFn([](InferenceContext* c) {
- PartialTensorShape shape;
- TF_RETURN_IF_ERROR(c->GetAttr("shape", &shape));
- ShapeHandle output;
- TF_RETURN_IF_ERROR(c->MakeShapeFromPartialTensorShape(shape, &output));
- c->set_output(0, output);
- return Status::OK();
- })
+ .SetShapeFn(shape_inference::ExplicitShape)
.Deprecated(23, "Placeholder now behaves the same as PlaceholderV2.")
.Doc(R"doc(
A placeholder op for a value that will be fed into the computation.
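
The removed lambdas all implement the same shape function, which is what shape_inference::ExplicitShape factors out. Reconstructed from those lambdas (a sketch, not the library's source), it amounts to:

```c++
// Reads the op's "shape" attr and uses it as the shape of output 0.
Status ExplicitShape(shape_inference::InferenceContext* c) {
  PartialTensorShape shape;
  TF_RETURN_IF_ERROR(c->GetAttr("shape", &shape));
  shape_inference::ShapeHandle output_shape;
  TF_RETURN_IF_ERROR(
      c->MakeShapeFromPartialTensorShape(shape, &output_shape));
  c->set_output(0, output_shape);
  return Status::OK();
}
```

The VariableV2 and TemporaryVariable registrations later in this patch are switched to the same helper.
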
diff --git a/tensorflow/core/ops/nn_ops.cc b/tensorflow/core/ops/nn_ops.cc
index 22afa4db9a..bcfdada329 100644
--- a/tensorflow/core/ops/nn_ops.cc
+++ b/tensorflow/core/ops/nn_ops.cc
@@ -329,7 +329,7 @@ batch_variance: A 1D Tensor for the computed batch variance, to be used by
reserve_space_1: A 1D Tensor for the computed batch mean, to be reused
in the gradient computation.
reserve_space_2: A 1D Tensor for the computed batch variance (inverted variance
- in the cuDNN case), to be used in the gradient computation.
+ in the cuDNN case), to be reused in the gradient computation.
T: The data type for the elements of input and output Tensors.
epsilon: A small float number added to the variance of x.
data_format: The data format for x and y. Either "NHWC" (default) or "NCHW".
@@ -409,10 +409,14 @@ The size of 1D Tensors matches the dimension C of the 4D Tensors.
y_backprop: A 4D Tensor for the gradient with respect to y.
x: A 4D Tensor for input data.
scale: A 1D Tensor for scaling factor, to scale the normalized x.
-reserve_space_1: A 1D Tensor for the computed batch mean, to be reused
- in the gradient computation.
-reserve_space_2: A 1D Tensor for the computed batch variance (inverted variance
- in the cuDNN case), to be used in the gradient computation.
+reserve_space_1: When is_training is True, a 1D Tensor for the computed batch mean
+ to be reused in gradient computation.
+ When is_training is False, a 1D Tensor for the population mean
+ to be reused in both 1st and 2nd order gradient computation.
+reserve_space_2: When is_training is True, a 1D Tensor for the computed batch variance
+ (inverted variance in the cuDNN case) to be reused in gradient computation.
+ When is_training is False, a 1D Tensor for the population variance
+ to be reused in both 1st and 2nd order gradient computation.
x_backprop: A 4D Tensor for the gradient with respect to x.
scale_backprop: A 1D Tensor for the gradient with respect to scale.
offset_backprop: A 1D Tensor for the gradient with respect to offset.
diff --git a/tensorflow/core/ops/ops.pbtxt b/tensorflow/core/ops/ops.pbtxt
index 4819dbadba..039be0d0c4 100644
--- a/tensorflow/core/ops/ops.pbtxt
+++ b/tensorflow/core/ops/ops.pbtxt
@@ -15868,6 +15868,25 @@ op {
summary: "Transforms a serialized tensorflow.TensorProto proto into a Tensor."
}
op {
+ name: "SerializeTensor"
+ input_arg {
+ name: "tensor"
+ description: "A Tensor of type `T`."
+    type_attr: "T"
+ }
+ output_arg {
+ name: "serialized"
+ description: "A serialized TensorProto proto of the input tensor."
+    type: DT_STRING
+ }
+ attr {
+ name: "T"
+ type: "type"
+ description: "The type of the input tensor."
+ }
+ summary: "Transforms a Tensor into a serialized TensorProto proto."
+}
+op {
name: "Placeholder"
output_arg {
name: "output"
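
For reference, an op registration of roughly this form would generate the SerializeTensor entry added above; this is an illustrative reconstruction, not a copy of the actual registration:

```c++
REGISTER_OP("SerializeTensor")
    .Input("tensor: T")
    .Output("serialized: string")
    .Attr("T: type")
    .SetShapeFn(shape_inference::ScalarShape)  // the output is a scalar string
    .Doc(R"doc(
Transforms a Tensor into a serialized TensorProto proto.

tensor: A Tensor of type `T`.
serialized: A serialized TensorProto proto of the input tensor.
T: The type of the input tensor.
)doc");
```
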
diff --git a/tensorflow/core/ops/state_ops.cc b/tensorflow/core/ops/state_ops.cc
index 7cf5dfcca8..dd3840d01c 100644
--- a/tensorflow/core/ops/state_ops.cc
+++ b/tensorflow/core/ops/state_ops.cc
@@ -28,15 +28,7 @@ REGISTER_OP("VariableV2")
.Attr("container: string = ''")
.Attr("shared_name: string = ''")
.SetIsStateful()
- .SetShapeFn([](InferenceContext* c) {
- PartialTensorShape shape;
- TF_RETURN_IF_ERROR(c->GetAttr("shape", &shape));
- ShapeHandle output_shape;
- TF_RETURN_IF_ERROR(
- c->MakeShapeFromPartialTensorShape(shape, &output_shape));
- c->set_output(0, output_shape);
- return Status::OK();
- })
+ .SetShapeFn(shape_inference::ExplicitShape)
.Doc(R"doc(
Holds state in the form of a tensor that persists across steps.
@@ -99,14 +91,7 @@ REGISTER_OP("TemporaryVariable")
.Attr("dtype: type")
.Attr("var_name: string = ''")
.SetIsStateful()
- .SetShapeFn([](InferenceContext* c) {
- PartialTensorShape shape;
- TF_RETURN_IF_ERROR(c->GetAttr("shape", &shape));
- ShapeHandle output;
- TF_RETURN_IF_ERROR(c->MakeShapeFromPartialTensorShape(shape, &output));
- c->set_output(0, output);
- return Status::OK();
- })
+ .SetShapeFn(shape_inference::ExplicitShape)
.Doc(R"doc(
Returns a tensor that may be mutated, but only persists within a single step.
diff --git a/tensorflow/core/platform/cpu_info.cc b/tensorflow/core/platform/cpu_info.cc
index 5cade740e4..99de364042 100644
--- a/tensorflow/core/platform/cpu_info.cc
+++ b/tensorflow/core/platform/cpu_info.cc
@@ -255,7 +255,6 @@ class CPUIDInfo {
int model_num() { return model_num_; }
private:
- int highest_eax_;
int have_adx_ : 1;
int have_aes_ : 1;
int have_avx_ : 1;
diff --git a/tensorflow/core/public/version.h b/tensorflow/core/public/version.h
index 9ba3a509c3..ccb861c93a 100644
--- a/tensorflow/core/public/version.h
+++ b/tensorflow/core/public/version.h
@@ -19,12 +19,12 @@ limitations under the License.
// TensorFlow uses semantic versioning, see http://semver.org/.
#define TF_MAJOR_VERSION 1
-#define TF_MINOR_VERSION 4
+#define TF_MINOR_VERSION 3
#define TF_PATCH_VERSION 0
// TF_VERSION_SUFFIX is non-empty for pre-releases (e.g. "-alpha", "-alpha.1",
// "-beta", "-rc", "-rc.1")
-#define TF_VERSION_SUFFIX "-dev"
+#define TF_VERSION_SUFFIX ""
#define TF_STR_HELPER(x) #x
#define TF_STR(x) TF_STR_HELPER(x)
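
With the suffix cleared and the minor version set back, the stringified macros now yield "1.3.0". A sketch of how they combine, assuming TF_VERSION_STRING is assembled from these pieces further down in version.h:

```c++
// Illustrative only; the real TF_VERSION_STRING definition lives in version.h.
#define TF_VERSION_STRING                                     \
  (TF_STR(TF_MAJOR_VERSION) "." TF_STR(TF_MINOR_VERSION) "." \
   TF_STR(TF_PATCH_VERSION) TF_VERSION_SUFFIX)
// Expands to "1" "." "3" "." "0" ""  ->  "1.3.0"
```
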
diff --git a/tensorflow/docs_src/api_guides/python/contrib.layers.md b/tensorflow/docs_src/api_guides/python/contrib.layers.md
index d4cda3a254..b85db4b96f 100644
--- a/tensorflow/docs_src/api_guides/python/contrib.layers.md
+++ b/tensorflow/docs_src/api_guides/python/contrib.layers.md
@@ -102,6 +102,7 @@ Feature columns provide a mechanism to map data to a model.
* @{tf.contrib.layers.sparse_column_with_hash_bucket}
* @{tf.contrib.layers.sparse_column_with_integerized_feature}
* @{tf.contrib.layers.sparse_column_with_keys}
+* @{tf.contrib.layers.sparse_column_with_vocabulary_file}
* @{tf.contrib.layers.weighted_sparse_column}
* @{tf.contrib.layers.weighted_sum_from_feature_columns}
* @{tf.contrib.layers.infer_real_valued_columns}
diff --git a/tensorflow/docs_src/api_guides/python/reading_data.md b/tensorflow/docs_src/api_guides/python/reading_data.md
index c0ddd82d73..8b6196ea34 100644
--- a/tensorflow/docs_src/api_guides/python/reading_data.md
+++ b/tensorflow/docs_src/api_guides/python/reading_data.md
@@ -496,6 +496,6 @@ that allow the user to change the input pipeline without rebuilding the graph or
session.
Note: Regardless of the implementation, many
-operations (like ${tf.layers.batch_normalization}, and @{tf.layers.dropout})
+operations (like @{tf.layers.batch_normalization}, and @{tf.layers.dropout})
need to know if they are in training or evaluation mode, and you must be
careful to set this appropriately if you change the data source.
diff --git a/tensorflow/docs_src/extend/adding_an_op.md b/tensorflow/docs_src/extend/adding_an_op.md
index 120207e802..7d71fb5f4a 100644
--- a/tensorflow/docs_src/extend/adding_an_op.md
+++ b/tensorflow/docs_src/extend/adding_an_op.md
@@ -444,19 +444,19 @@ Now that you know how to build a basic (and somewhat restricted) op and
implementation, we'll look at some of the more complicated things you will
typically need to build into your op. This includes:
-* [Conditional checks and validation](#validate)
-* Op registration
+* [Conditional checks and validation](#conditional_checks_and_validation)
+* [Op registration](#op_registration)
* [Attrs](#attrs)
- * [Attr types](#attr-types)
+ * [Attr types](#attr_types)
* [Polymorphism](#polymorphism)
- * [Inputs and outputs](#inputs-outputs)
- * [Backwards compatibility](#backward-compat)
-* [GPU support](#gpu-support)
- * [Compiling the kernel for the GPU device](#compiling-kernel)
-* [Implement the gradient in Python](#implement-gradient)
-* [Shape functions in C++](#shape-functions)
+ * [Inputs and outputs](#inputs_and_outputs)
+ * [Backwards compatibility](#backwards_compatibility)
+* [GPU support](#gpu_support)
+ * [Compiling the kernel for the GPU device](#compiling_the_kernel_for_the_gpu_device)
+* [Implement the gradient in Python](#implement_the_gradient_in_python)
+* [Shape functions in C++](#shape_functions_in_c)
-### Conditional checks and validation {#validate}
+### Conditional checks and validation
The example above assumed that the op applied to a tensor of any shape. What
if it only applied to vectors? That means adding a check to the above OpKernel
@@ -497,7 +497,7 @@ function on error.
### Op registration
-#### Attrs {#attrs}
+#### Attrs
Ops can have attrs, whose values are set when the op is added to a graph. These
are used to configure the op, and their values can be accessed both within the
@@ -519,7 +519,7 @@ using the `Attr` method, which expects a spec of the form:
where `<name>` begins with a letter and can be composed of alphanumeric
characters and underscores, and `<attr-type-expr>` is a type expression of the
-form [described below](#attr-types).
+form [described below](#attr_types).
For example, if you'd like the `ZeroOut` op to preserve a user-specified index,
instead of only the 0th element, you can register the op like so:
@@ -530,7 +530,7 @@ REGISTER\_OP("ZeroOut")
.Output("zeroed: int32");
</code></pre>
-(Note that the set of [attribute types](#attr-types) is different from the
+(Note that the set of [attribute types](#attr_types) is different from the
@{tf.DType$tensor types} used for inputs and outputs.)
Your kernel can then access this attr in its constructor via the `context`
@@ -574,7 +574,7 @@ which can then be used in the `Compute` method:
}
</code></pre>
-#### Attr types {#attr-types}
+#### Attr types
The following types are supported in an attr:
@@ -707,7 +707,7 @@ REGISTER_OP("AttrDefaultExampleForAllTypes")
Note in particular that the values of type `type`
use @{tf.DType$the `DT_*` names for the types}.
-#### Polymorphism {#polymorphism}
+#### Polymorphism
##### Type Polymorphism
@@ -1009,7 +1009,7 @@ REGISTER_OP("MinimumLengthPolymorphicListExample")
.Output("out: T");
```
-#### Inputs and Outputs {#inputs-outputs}
+#### Inputs and Outputs
To summarize the above, an op registration can have multiple inputs and outputs:
@@ -1110,7 +1110,7 @@ expressions:
For more details, see
[`tensorflow/core/framework/op_def_builder.h`][op_def_builder].
-#### Backwards compatibility {#backward-compat}
+#### Backwards compatibility
Let's assume you have written a nice, custom op and shared it with others, so
you have happy customers using your operation. However, you'd like to make
@@ -1172,7 +1172,7 @@ new optional arguments to the end. Generally incompatible changes may only be
made when TensorFlow's changes major versions, and must conform to the
@{$version_compat#compatibility_of_graphs_and_checkpoints$`GraphDef` version semantics}.
-### GPU Support {#gpu-support}
+### GPU Support
You can implement different OpKernels and register one for CPU and another for
GPU, just like you can [register kernels for different types](#polymorphism).
@@ -1204,7 +1204,7 @@ kept on the CPU, add a `HostMemory()` call to the kernel registration, e.g.:
PadOp<GPUDevice, T>)
```
-#### Compiling the kernel for the GPU device {#compiling-kernel}
+#### Compiling the kernel for the GPU device
Look at
[cuda_op_kernel.cu.cc](https://www.tensorflow.org/code/tensorflow/examples/adding_an_op/cuda_op_kernel.cu.cc)
@@ -1237,7 +1237,7 @@ For example, add `-L /usr/local/cuda-8.0/lib64/` if your CUDA is installed in
> Note in some linux settings, additional options to `nvcc` compiling step are needed. Add `-D_MWAITXINTRIN_H_INCLUDED` to the `nvcc` command line to avoid errors from `mwaitxintrin.h`.
-### Implement the gradient in Python {#implement-gradient}
+### Implement the gradient in Python
Given a graph of ops, TensorFlow uses automatic differentiation
(backpropagation) to add new ops representing gradients with respect to the
@@ -1317,7 +1317,7 @@ Note that at the time the gradient function is called, only the data flow graph
of ops is available, not the tensor data itself. Thus, all computation must be
performed using other tensorflow ops, to be run at graph execution time.
-### Shape functions in C++ {#shape-functions}
+### Shape functions in C++
The TensorFlow API has a feature called "shape inference" that provides
information about the shapes of tensors without having to execute the
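As a companion to the "Implement the gradient in Python" section touched above, here is a minimal sketch of the registration pattern it describes, assuming a hypothetical `ZeroOut` op with one input and one output in which only element 0 carries gradient (an illustration, not part of the patch):

```python
from tensorflow.python.framework import ops
from tensorflow.python.ops import array_ops
from tensorflow.python.ops import sparse_ops

@ops.RegisterGradient("ZeroOut")
def _zero_out_grad(op, grad):
  """Gradient for ZeroOut: only the first input element receives gradient."""
  to_zero = op.inputs[0]
  shape = array_ops.shape(to_zero)
  index = array_ops.zeros_like(shape)                 # index of the origin
  first_grad = array_ops.reshape(grad, [-1])[0]
  to_zero_grad = sparse_ops.sparse_to_dense([index], shape, first_grad, 0)
  return [to_zero_grad]  # one gradient per input of the op
```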
diff --git a/tensorflow/docs_src/get_started/mnist/beginners.md b/tensorflow/docs_src/get_started/mnist/beginners.md
index 193dd41b2a..38c467ddc3 100644
--- a/tensorflow/docs_src/get_started/mnist/beginners.md
+++ b/tensorflow/docs_src/get_started/mnist/beginners.md
@@ -180,11 +180,11 @@ You can think of it as converting tallies
of evidence into probabilities of our input being in each class.
It's defined as:
-$$\text{softmax}(x) = \text{normalize}(\exp(x))$$
+$$\text{softmax}(evidence) = \text{normalize}(\exp(evidence))$$
If you expand that equation out, you get:
-$$\text{softmax}(x)_i = \frac{\exp(x_i)}{\sum_j \exp(x_j)}$$
+$$\text{softmax}(evidence)_i = \frac{\exp(evidence_i)}{\sum_j \exp(evidence_j)}$$
But it's often more helpful to think of softmax the first way: exponentiating
its inputs and then normalizing them. The exponentiation means that one more
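A small NumPy check of the renamed formula, using an assumed 3-class evidence vector (a sketch, not part of the patch):

```python
import numpy as np

def softmax(evidence):
  # Exponentiate, then normalize; shifting by the max keeps exp() stable.
  exps = np.exp(evidence - np.max(evidence))
  return exps / np.sum(exps)

print(softmax(np.array([2.0, 1.0, 0.1])))  # approx. [0.659, 0.242, 0.099]
```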
diff --git a/tensorflow/docs_src/install/install_c.md b/tensorflow/docs_src/install/install_c.md
index 04cd462848..7ebf5c4a2c 100644
--- a/tensorflow/docs_src/install/install_c.md
+++ b/tensorflow/docs_src/install/install_c.md
@@ -35,7 +35,7 @@ enable TensorFlow for C:
OS="linux" # Change to "darwin" for Mac OS
TARGET_DIRECTORY="/usr/local"
curl -L \
- "https://storage.googleapis.com/tensorflow/libtensorflow/libtensorflow-${TF_TYPE}-${OS}-x86_64-1.4.0-dev.tar.gz" |
+ "https://storage.googleapis.com/tensorflow/libtensorflow/libtensorflow-${TF_TYPE}-${OS}-x86_64-1.3.0.tar.gz" |
sudo tar -C $TARGET_DIRECTORY -xz
The `tar` command extracts the TensorFlow C library into the `lib`
diff --git a/tensorflow/docs_src/install/install_go.md b/tensorflow/docs_src/install/install_go.md
index b7fa1fe39a..b991fd0f93 100644
--- a/tensorflow/docs_src/install/install_go.md
+++ b/tensorflow/docs_src/install/install_go.md
@@ -35,7 +35,7 @@ steps to install this library and enable TensorFlow for Go:
TF_TYPE="cpu" # Change to "gpu" for GPU support
TARGET_DIRECTORY='/usr/local'
curl -L \
- "https://storage.googleapis.com/tensorflow/libtensorflow/libtensorflow-${TF_TYPE}-$(go env GOOS)-x86_64-1.4.0-dev.tar.gz" |
+ "https://storage.googleapis.com/tensorflow/libtensorflow/libtensorflow-${TF_TYPE}-$(go env GOOS)-x86_64-1.3.0.tar.gz" |
sudo tar -C $TARGET_DIRECTORY -xz
The `tar` command extracts the TensorFlow C library into the `lib`
diff --git a/tensorflow/docs_src/install/install_java.md b/tensorflow/docs_src/install/install_java.md
index e1200dde12..2adcd4da73 100644
--- a/tensorflow/docs_src/install/install_java.md
+++ b/tensorflow/docs_src/install/install_java.md
@@ -34,7 +34,7 @@ following to the project's `pom.xml` to use the TensorFlow Java APIs:
<dependency>
<groupId>org.tensorflow</groupId>
<artifactId>tensorflow</artifactId>
- <version>1.4.0-dev</version>
+ <version>1.3.0</version>
</dependency>
```
@@ -63,7 +63,7 @@ As an example, these steps will create a Maven project that uses TensorFlow:
<dependency>
<groupId>org.tensorflow</groupId>
<artifactId>tensorflow</artifactId>
- <version>1.4.0-dev</version>
+ <version>1.3.0</version>
</dependency>
</dependencies>
</project>
@@ -122,7 +122,7 @@ refer to the simpler instructions above instead.
Take the following steps to install TensorFlow for Java on Linux or Mac OS:
1. Download
- [libtensorflow.jar](https://storage.googleapis.com/tensorflow/libtensorflow/libtensorflow-1.4.0-dev.jar),
+ [libtensorflow.jar](https://storage.googleapis.com/tensorflow/libtensorflow/libtensorflow-1.3.0.jar),
which is the TensorFlow Java Archive (JAR).
2. Decide whether you will run TensorFlow for Java on CPU(s) only or with
@@ -141,7 +141,7 @@ Take the following steps to install TensorFlow for Java on Linux or Mac OS:
OS=$(uname -s | tr '[:upper:]' '[:lower:]')
mkdir -p ./jni
curl -L \
- "https://storage.googleapis.com/tensorflow/libtensorflow/libtensorflow_jni-${TF_TYPE}-${OS}-x86_64-1.4.0-dev.tar.gz" |
+ "https://storage.googleapis.com/tensorflow/libtensorflow/libtensorflow_jni-${TF_TYPE}-${OS}-x86_64-1.3.0.tar.gz" |
tar -xz -C ./jni
### Install on Windows
@@ -149,10 +149,10 @@ Take the following steps to install TensorFlow for Java on Linux or Mac OS:
Take the following steps to install TensorFlow for Java on Windows:
1. Download
- [libtensorflow.jar](https://storage.googleapis.com/tensorflow/libtensorflow/libtensorflow-1.4.0-dev.jar),
+ [libtensorflow.jar](https://storage.googleapis.com/tensorflow/libtensorflow/libtensorflow-1.3.0.jar),
which is the TensorFlow Java Archive (JAR).
2. Download the following Java Native Interface (JNI) file appropriate for
- [TensorFlow for Java on Windows](https://storage.googleapis.com/tensorflow/libtensorflow/libtensorflow_jni-cpu-windows-x86_64-1.4.0-dev.zip).
+ [TensorFlow for Java on Windows](https://storage.googleapis.com/tensorflow/libtensorflow/libtensorflow_jni-cpu-windows-x86_64-1.3.0.zip).
3. Extract this .zip file.
@@ -200,7 +200,7 @@ must be part of your `classpath`. For example, you can include the
downloaded `.jar` in your `classpath` by using the `-cp` compilation flag
as follows:
-<pre><b>javac -cp libtensorflow-1.4.0-dev.jar HelloTF.java</b></pre>
+<pre><b>javac -cp libtensorflow-1.3.0.jar HelloTF.java</b></pre>
### Running
@@ -214,11 +214,11 @@ two files are available to the JVM:
For example, the following command line executes the `HelloTF` program on Linux
and Mac OS X:
-<pre><b>java -cp libtensorflow-1.4.0-dev.jar:. -Djava.library.path=./jni HelloTF</b></pre>
+<pre><b>java -cp libtensorflow-1.3.0.jar:. -Djava.library.path=./jni HelloTF</b></pre>
And the following command line executes the `HelloTF` program on Windows:
-<pre><b>java -cp libtensorflow-1.4.0-dev.jar;. -Djava.library.path=jni HelloTF</b></pre>
+<pre><b>java -cp libtensorflow-1.3.0.jar;. -Djava.library.path=jni HelloTF</b></pre>
If the program prints <tt>Hello from <i>version</i></tt>, you've successfully
installed TensorFlow for Java and are ready to use the API. If the program
diff --git a/tensorflow/docs_src/install/install_linux.md b/tensorflow/docs_src/install/install_linux.md
index b759797082..576099f054 100644
--- a/tensorflow/docs_src/install/install_linux.md
+++ b/tensorflow/docs_src/install/install_linux.md
@@ -445,7 +445,7 @@ Take the following steps to install TensorFlow in an Anaconda environment:
2. Create a conda environment named <tt>tensorflow</tt> to run a version
of Python by invoking the following command:
- <pre>$ <b>conda create -n tensorflow</b></pre>
+ <pre>$ <b>conda create -n tensorflow python=2.7 # or python=3.3, etc.</b></pre>
3. Activate the conda environment by issuing the following command:
diff --git a/tensorflow/docs_src/install/install_mac.md b/tensorflow/docs_src/install/install_mac.md
index 448e300b17..b6daeb0dd6 100644
--- a/tensorflow/docs_src/install/install_mac.md
+++ b/tensorflow/docs_src/install/install_mac.md
@@ -321,7 +321,7 @@ Take the following steps to install TensorFlow in an Anaconda environment:
2. Create a conda environment named `tensorflow`
by invoking the following command:
- <pre>$ <b>conda create -n tensorflow</b></pre>
+ <pre>$ <b>conda create -n tensorflow python=2.7 # or python=3.3, etc.</b></pre>
3. Activate the conda environment by issuing the following command:
diff --git a/tensorflow/examples/android/README.md b/tensorflow/examples/android/README.md
index bed8e21498..79202a38d7 100644
--- a/tensorflow/examples/android/README.md
+++ b/tensorflow/examples/android/README.md
@@ -26,7 +26,7 @@ on API >= 14 devices.
in an overlay on the camera image.
2. [TF Detect](https://github.com/tensorflow/tensorflow/blob/master/tensorflow/examples/android/src/org/tensorflow/demo/DetectorActivity.java):
Demonstrates an SSD-Mobilenet model trained using the
- [Tensorflow Object Detection API](https://github.com/tensorflow/models/tree/master/object_detection/)
+ [Tensorflow Object Detection API](https://github.com/tensorflow/models/tree/master/research/object_detection/)
introduced in [Speed/accuracy trade-offs for modern convolutional object detectors](https://arxiv.org/abs/1611.10012) to
localize and track objects (from 80 categories) in the camera preview
in real-time.
diff --git a/tensorflow/examples/get_started/regression/BUILD b/tensorflow/examples/get_started/regression/BUILD
index 779fa1e554..334c8096c1 100644
--- a/tensorflow/examples/get_started/regression/BUILD
+++ b/tensorflow/examples/get_started/regression/BUILD
@@ -32,6 +32,5 @@ py_test(
deps = [
"//tensorflow:tensorflow_py",
"//third_party/py/numpy",
- "//third_party/py/pandas",
],
)
diff --git a/tensorflow/examples/speech_commands/train.py b/tensorflow/examples/speech_commands/train.py
index c92c38b23c..a54bcbdb32 100644
--- a/tensorflow/examples/speech_commands/train.py
+++ b/tensorflow/examples/speech_commands/train.py
@@ -75,6 +75,7 @@ import os.path
import sys
import numpy as np
+from six.moves import xrange # pylint: disable=redefined-builtin
import tensorflow as tf
import input_data
@@ -113,8 +114,8 @@ def main(_):
# example --how_many_training_steps=10000,3000 --learning_rate=0.001,0.0001
# will run 13,000 training loops in total, with a rate of 0.001 for the first
# 10,000, and 0.0001 for the final 3,000.
- training_steps_list = map(int, FLAGS.how_many_training_steps.split(','))
- learning_rates_list = map(float, FLAGS.learning_rate.split(','))
+ training_steps_list = list(map(int, FLAGS.how_many_training_steps.split(',')))
+ learning_rates_list = list(map(float, FLAGS.learning_rate.split(',')))
if len(training_steps_list) != len(learning_rates_list):
raise Exception(
'--how_many_training_steps and --learning_rate must be equal length '
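The `list(...)` wrappers matter because `map` returns a lazy iterator on Python 3, so the `len()` comparison in the surrounding code would fail. A quick sketch of the difference (not part of the patch):

```python
steps = map(int, '10000,3000'.split(','))
try:
  len(steps)           # TypeError on Python 3: map objects have no len()
except TypeError:
  pass

steps = list(map(int, '10000,3000'.split(',')))
assert len(steps) == 2 and steps == [10000, 3000]
```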
diff --git a/tensorflow/go/session.go b/tensorflow/go/session.go
index afa73030b8..fc914f86df 100644
--- a/tensorflow/go/session.go
+++ b/tensorflow/go/session.go
@@ -89,6 +89,10 @@ func (s *Session) Run(feeds map[Output]*Tensor, fetches []Output, targets []*Ope
ptrOutput(c.fetches), ptrTensor(c.fetchTensors), C.int(len(fetches)),
ptrOperation(c.targets), C.int(len(targets)),
nil, status.c)
+
+ // Make sure GC won't harvest input tensors until SessionRun() is finished
+ runtime.KeepAlive(feeds)
+
if err := status.Err(); err != nil {
return nil, err
}
diff --git a/tensorflow/go/tensor.go b/tensorflow/go/tensor.go
index 8b8909a6f8..a534a0d659 100644
--- a/tensorflow/go/tensor.go
+++ b/tensorflow/go/tensor.go
@@ -100,7 +100,7 @@ func NewTensor(value interface{}) (*Tensor, error) {
}
} else {
e := stringEncoder{offsets: buf, data: raw[nflattened*8 : len(raw)], status: newStatus()}
- if e.encode(reflect.ValueOf(value)); err != nil {
+ if err := e.encode(reflect.ValueOf(value)); err != nil {
return nil, err
}
if int64(buf.Len()) != nflattened*8 {
diff --git a/tensorflow/java/src/main/java/org/tensorflow/Tensor.java b/tensorflow/java/src/main/java/org/tensorflow/Tensor.java
index 4424100390..c5ad1ee51c 100644
--- a/tensorflow/java/src/main/java/org/tensorflow/Tensor.java
+++ b/tensorflow/java/src/main/java/org/tensorflow/Tensor.java
@@ -25,6 +25,7 @@ import java.nio.FloatBuffer;
import java.nio.IntBuffer;
import java.nio.LongBuffer;
import java.util.Arrays;
+import java.util.HashMap;
/**
* A typed multi-dimensional array.
@@ -97,9 +98,19 @@ public final class Tensor implements AutoCloseable {
* using {@link #create(DataType, long[], ByteBuffer)} instead.
*/
public static Tensor create(Object obj) {
+ return create(obj, dataTypeOf(obj));
+ }
+
+ /**
+ * Create a Tensor of data type {@code dtype} from a Java object.
+ *
+ * @param dtype the intended tensor data type. It must match the run-time type of the object.
+ */
+ static Tensor create(Object obj, DataType dtype) {
Tensor t = new Tensor();
- t.dtype = dataTypeOf(obj);
- t.shapeCopy = new long[numDimensions(obj)];
+ t.dtype = dtype;
+ t.shapeCopy = new long[numDimensions(obj, dtype)];
+ assert objectCompatWithType(obj, dtype);
fillShape(obj, 0, t.shapeCopy);
if (t.dtype != DataType.STRING) {
int byteSize = elemByteSize(t.dtype) * numElements(t.shapeCopy);
@@ -190,8 +201,7 @@ public final class Tensor implements AutoCloseable {
*
* <p>Creates a Tensor with the provided shape of any type where the tensor's data has been
* encoded into {@code data} as per the specification of the TensorFlow <a
- * href="https://www.tensorflow.org/code/tensorflow/c/c_api.h">C
- * API</a>.
+ * href="https://www.tensorflow.org/code/tensorflow/c/c_api.h">C API</a>.
*
* @param dataType the tensor datatype.
* @param shape the tensor shape.
@@ -537,56 +547,70 @@ public final class Tensor implements AutoCloseable {
}
}
+ private static HashMap<Class<?>, DataType> classDataTypes = new HashMap<>();
+
+ static {
+ classDataTypes.put(int.class, DataType.INT32);
+ classDataTypes.put(Integer.class, DataType.INT32);
+ classDataTypes.put(long.class, DataType.INT64);
+ classDataTypes.put(Long.class, DataType.INT64);
+ classDataTypes.put(float.class, DataType.FLOAT);
+ classDataTypes.put(Float.class, DataType.FLOAT);
+ classDataTypes.put(double.class, DataType.DOUBLE);
+ classDataTypes.put(Double.class, DataType.DOUBLE);
+ classDataTypes.put(byte.class, DataType.STRING);
+ classDataTypes.put(Byte.class, DataType.STRING);
+ classDataTypes.put(boolean.class, DataType.BOOL);
+ classDataTypes.put(Boolean.class, DataType.BOOL);
+ }
+
private static DataType dataTypeOf(Object o) {
- if (o.getClass().isArray()) {
- if (Array.getLength(o) == 0) {
- throw new IllegalArgumentException("cannot create Tensors with a 0 dimension");
- }
- // byte[] is a DataType.STRING scalar.
- Object e = Array.get(o, 0);
- if (e == null) {
- throwExceptionIfNotByteOfByteArrays(o);
- return DataType.STRING;
- }
- if (Byte.class.isInstance(e) || byte.class.isInstance(e)) {
- return DataType.STRING;
- }
- return dataTypeOf(e);
+ Class<?> c = o.getClass();
+ while (c.isArray()) {
+ c = c.getComponentType();
}
- if (Float.class.isInstance(o) || float.class.isInstance(o)) {
- return DataType.FLOAT;
- } else if (Double.class.isInstance(o) || double.class.isInstance(o)) {
- return DataType.DOUBLE;
- } else if (Integer.class.isInstance(o) || int.class.isInstance(o)) {
- return DataType.INT32;
- } else if (Long.class.isInstance(o) || long.class.isInstance(o)) {
- return DataType.INT64;
- } else if (Boolean.class.isInstance(o) || boolean.class.isInstance(o)) {
- return DataType.BOOL;
- } else {
- throw new IllegalArgumentException("cannot create Tensors of " + o.getClass().getName());
+ DataType ret = classDataTypes.get(c);
+ if (ret != null) {
+ return ret;
}
+ throw new IllegalArgumentException("cannot create Tensors of type " + c.getName());
}
- private static int numDimensions(Object o) {
- if (o.getClass().isArray()) {
- Object e = Array.get(o, 0);
- if (e == null) {
- throwExceptionIfNotByteOfByteArrays(o);
- return 1;
- } else if (Byte.class.isInstance(e) || byte.class.isInstance(e)) {
- return 0;
- }
- return 1 + numDimensions(e);
+ /**
+ * Returns the number of dimensions of a tensor of type dtype when represented by the object o.
+ */
+ private static int numDimensions(Object o, DataType dtype) {
+ int ret = numArrayDimensions(o);
+ if (dtype == DataType.STRING && ret > 0) {
+ return ret - 1;
}
- return 0;
+ return ret;
}
+ /** Returns the number of dimensions of the array object o. Returns 0 if o is not an array. */
+ private static int numArrayDimensions(Object o) {
+ Class<?> c = o.getClass();
+ int i = 0;
+ while (c.isArray()) {
+ c = c.getComponentType();
+ i++;
+ }
+ return i;
+ }
+
+ /**
+ * Fills in the remaining entries in the shape array starting from position {@code dim} with the
+ * dimension sizes of the multidimensional array o. Checks that all arrays reachable from o have
+ * sizes consistent with the filled-in shape, throwing IllegalArgumentException otherwise.
+ */
private static void fillShape(Object o, int dim, long[] shape) {
if (shape == null || dim == shape.length) {
return;
}
final int len = Array.getLength(o);
+ if (len == 0) {
+ throw new IllegalArgumentException("cannot create Tensors with a 0 dimension");
+ }
if (shape[dim] == 0) {
shape[dim] = len;
} else if (shape[dim] != len) {
@@ -598,15 +622,27 @@ public final class Tensor implements AutoCloseable {
}
}
+ /** Returns whether the object {@code obj} can represent a tensor with data type {@code dtype}. */
+ private static boolean objectCompatWithType(Object obj, DataType dtype) {
+ DataType dto = dataTypeOf(obj);
+ if (dto.equals(dtype)) {
+ return true;
+ }
+ if (dto == DataType.STRING && dtype == DataType.UINT8) {
+ return true;
+ }
+ return false;
+ }
+
private void throwExceptionIfTypeIsIncompatible(Object o) {
final int rank = numDimensions();
- final int oRank = numDimensions(o);
+ final int oRank = numDimensions(o, dtype);
if (oRank != rank) {
throw new IllegalArgumentException(
String.format(
"cannot copy Tensor with %d dimensions into an object with %d", rank, oRank));
}
- if (dataTypeOf(o) != dtype) {
+ if (!objectCompatWithType(o, dtype)) {
throw new IllegalArgumentException(
String.format(
"cannot copy Tensor with DataType %s into an object of type %s",
diff --git a/tensorflow/java/src/test/java/org/tensorflow/TensorTest.java b/tensorflow/java/src/test/java/org/tensorflow/TensorTest.java
index bb5f9a0708..036db04503 100644
--- a/tensorflow/java/src/test/java/org/tensorflow/TensorTest.java
+++ b/tensorflow/java/src/test/java/org/tensorflow/TensorTest.java
@@ -411,6 +411,19 @@ public class TensorTest {
}
@Test
+ public void testUInt8Tensor() {
+ byte[] vector = new byte[] { 1, 2, 3, 4 };
+ try (Tensor t = Tensor.create(vector, DataType.UINT8)) {
+ assertEquals(DataType.UINT8, t.dataType());
+ assertEquals(1, t.numDimensions());
+ assertArrayEquals(new long[] {4}, t.shape());
+
+ byte[] got = t.copyTo(new byte[4]);
+ assertArrayEquals(got, vector);
+ }
+ }
+
+ @Test
public void failCreateOnMismatchedDimensions() {
int[][][] invalid = new int[3][1][];
for (int x = 0; x < invalid.length; ++x) {
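A rough Python analogue (an assumption, not the Java API itself) of the `numDimensions(o, dtype)` rule introduced above: for `STRING` tensors the innermost byte array encodes a single scalar, so it does not count as a tensor dimension, while the same `byte[]` counts as rank 1 for `UINT8`, matching `testUInt8Tensor`:

```python
def num_array_dimensions(obj):
  # Count nesting depth of list-of-lists, analogous to Java array dimensions.
  dims = 0
  while isinstance(obj, list):
    obj = obj[0]
    dims += 1
  return dims

def num_dimensions(obj, dtype):
  dims = num_array_dimensions(obj)
  if dtype == 'STRING' and dims > 0:
    return dims - 1  # innermost array is the byte content of one string
  return dims

assert num_dimensions([[1, 2], [3, 4]], 'INT32') == 2
assert num_dimensions([[104, 105], [104, 106]], 'STRING') == 1  # vector of strings
assert num_dimensions([1, 2, 3, 4], 'UINT8') == 1
```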
diff --git a/tensorflow/python/client/session.py b/tensorflow/python/client/session.py
index 7d698c2972..759c36ad72 100644
--- a/tensorflow/python/client/session.py
+++ b/tensorflow/python/client/session.py
@@ -980,6 +980,8 @@ class BaseSession(SessionInterface):
raise RuntimeError('The Session graph is empty. Add operations to the '
'graph before calling run().')
+ if feeds is None:
+ feeds = []
# Create request.
feed_list = []
diff --git a/tensorflow/python/client/session_partial_run_test.py b/tensorflow/python/client/session_partial_run_test.py
index 33b90e6156..6ecf0fc6c7 100644
--- a/tensorflow/python/client/session_partial_run_test.py
+++ b/tensorflow/python/client/session_partial_run_test.py
@@ -196,6 +196,14 @@ class PartialRunTestMethods(object):
'specify at least one target to fetch or execute.'):
sess.partial_run_setup(fetches=[], feeds=[x])
+ def testPartialRunSetupNoFeedsPassed(self):
+ sess = session.Session()
+ r1 = constant_op.constant([6.0])
+
+ h = sess.partial_run_setup([r1])
+ result1 = sess.partial_run(h, r1)
+ self.assertEqual([6.0], result1)
+
def testPartialRunDirect(self):
self.RunTestPartialRun(session.Session())
diff --git a/tensorflow/python/estimator/canned/head.py b/tensorflow/python/estimator/canned/head.py
index daf9398f88..ea2dfac526 100644
--- a/tensorflow/python/estimator/canned/head.py
+++ b/tensorflow/python/estimator/canned/head.py
@@ -19,6 +19,7 @@ from __future__ import division
from __future__ import print_function
import abc
+import collections
import collections
import six
diff --git a/tensorflow/python/estimator/estimator.py b/tensorflow/python/estimator/estimator.py
index 36e3036ee6..47bced72ab 100644
--- a/tensorflow/python/estimator/estimator.py
+++ b/tensorflow/python/estimator/estimator.py
@@ -146,6 +146,7 @@ class Estimator(object):
# Model directory.
if (model_dir is not None) and (self._config.model_dir is not None):
if model_dir != self._config.model_dir:
+ # TODO(alanyee): remove this suppression after it is no longer needed
# pylint: disable=g-doc-exception
raise ValueError(
"model_dir are set both in constructor and RunConfig, but with "
diff --git a/tensorflow/python/framework/constant_op.py b/tensorflow/python/framework/constant_op.py
index 704e3d251f..44c509265e 100644
--- a/tensorflow/python/framework/constant_op.py
+++ b/tensorflow/python/framework/constant_op.py
@@ -109,33 +109,33 @@ def convert_to_eager_tensor(t, ctx, dtype=None):
def constant(value, dtype=None, shape=None, name="Const", verify_shape=False):
"""Creates a constant tensor.
- The resulting tensor is populated with values of type `dtype`, as
- specified by arguments `value` and (optionally) `shape` (see examples
- below).
+ The resulting tensor is populated with values of type `dtype`, as
+ specified by arguments `value` and (optionally) `shape` (see examples
+ below).
- The argument `value` can be a constant value, or a list of values of type
- `dtype`. If `value` is a list, then the length of the list must be less
- than or equal to the number of elements implied by the `shape` argument (if
- specified). In the case where the list length is less than the number of
- elements specified by `shape`, the last element in the list will be used
- to fill the remaining entries.
+ The argument `value` can be a constant value, or a list of values of type
+ `dtype`. If `value` is a list, then the length of the list must be less
+ than or equal to the number of elements implied by the `shape` argument (if
+ specified). In the case where the list length is less than the number of
+ elements specified by `shape`, the last element in the list will be used
+ to fill the remaining entries.
- The argument `shape` is optional. If present, it specifies the dimensions of
- the resulting tensor. If not present, the shape of `value` is used.
+ The argument `shape` is optional. If present, it specifies the dimensions of
+ the resulting tensor. If not present, the shape of `value` is used.
- If the argument `dtype` is not specified, then the type is inferred from
- the type of `value`.
+ If the argument `dtype` is not specified, then the type is inferred from
+ the type of `value`.
- For example:
+ For example:
- ```python
- # Constant 1-D Tensor populated with value list.
- tensor = tf.constant([1, 2, 3, 4, 5, 6, 7]) => [1 2 3 4 5 6 7]
+ ```python
+ # Constant 1-D Tensor populated with value list.
+ tensor = tf.constant([1, 2, 3, 4, 5, 6, 7]) => [1 2 3 4 5 6 7]
- # Constant 2-D tensor populated with scalar value -1.
- tensor = tf.constant(-1.0, shape=[2, 3]) => [[-1. -1. -1.]
- [-1. -1. -1.]]
- ```
+ # Constant 2-D tensor populated with scalar value -1.
+ tensor = tf.constant(-1.0, shape=[2, 3]) => [[-1. -1. -1.]
+ [-1. -1. -1.]]
+ ```
Args:
value: A constant value (or list) of output type `dtype`.
diff --git a/tensorflow/python/framework/graph_util_impl.py b/tensorflow/python/framework/graph_util_impl.py
index 7df90ad72f..ce85747d7c 100644
--- a/tensorflow/python/framework/graph_util_impl.py
+++ b/tensorflow/python/framework/graph_util_impl.py
@@ -21,6 +21,7 @@ from __future__ import division
from __future__ import print_function
import copy
import re
+import six
from tensorflow.core.framework import attr_value_pb2
from tensorflow.core.framework import graph_pb2
@@ -123,6 +124,9 @@ def extract_sub_graph(graph_def, dest_nodes):
if not isinstance(graph_def, graph_pb2.GraphDef):
raise TypeError("graph_def must be a graph_pb2.GraphDef proto.")
+ if isinstance(dest_nodes, six.string_types):
+ raise TypeError("dest_nodes must be a list.")
+
edges = {} # Keyed by the dest node name.
name_to_node_map = {} # Keyed by node name.
diff --git a/tensorflow/python/framework/graph_util_test.py b/tensorflow/python/framework/graph_util_test.py
index 647ed1583a..0421837d49 100644
--- a/tensorflow/python/framework/graph_util_test.py
+++ b/tensorflow/python/framework/graph_util_test.py
@@ -188,6 +188,13 @@ class DeviceFunctionsTest(test.TestCase):
self.assertEqual("n3", sub_graph.node[2].name)
self.assertEqual("n5", sub_graph.node[3].name)
+ def testExtractSubGraphWithInvalidDestNodes(self):
+ graph_def = graph_pb2.GraphDef()
+ n1 = graph_def.node.add()
+ n1.name = "n1"
+ with self.assertRaisesRegexp(TypeError, "must be a list"):
+ graph_util.extract_sub_graph(graph_def, "n1")
+
def testConvertVariablesToConstsWithFunctions(self):
@function.Defun(dtypes.float32)
def plus_one(x):
diff --git a/tensorflow/python/framework/tensor_util.py b/tensorflow/python/framework/tensor_util.py
index c8bdb35e80..f82dae7ecc 100644
--- a/tensorflow/python/framework/tensor_util.py
+++ b/tensorflow/python/framework/tensor_util.py
@@ -626,7 +626,7 @@ def _ConstantValue(tensor, partial):
elif tensor.op.type == "Rank":
input_shape = tensor.op.inputs[0].get_shape()
if input_shape.ndims is not None:
- return np.ndarray(shape=(), buffer=np.array([input_shape.ndims]),
+ return np.ndarray(shape=(), buffer=np.array([input_shape.ndims], dtype=np.int32),
dtype=np.int32)
else:
return None
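The added `dtype=np.int32` matters because `np.ndarray` reinterprets the buffer's raw bytes; building an `int32` view on top of a default-integer (often `int64`) buffer is presumably what the change avoids, since that is fragile across platforms and byte orders. A minimal sketch (not part of the patch):

```python
import numpy as np

ndims = 3
# Buffer and view use the same dtype, so the scalar reads back unambiguously.
rank = np.ndarray(shape=(), buffer=np.array([ndims], dtype=np.int32),
                  dtype=np.int32)
assert int(rank) == 3
```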
diff --git a/tensorflow/python/keras/_impl/keras/backend.py b/tensorflow/python/keras/_impl/keras/backend.py
index 05756024ca..f02f6d10df 100644
--- a/tensorflow/python/keras/_impl/keras/backend.py
+++ b/tensorflow/python/keras/_impl/keras/backend.py
@@ -2894,7 +2894,7 @@ def elu(x, alpha=1.):
"""Exponential linear unit.
Arguments:
- x: A tenor or variable to compute the activation function for.
+ x: A tensor or variable to compute the activation function for.
alpha: A scalar, slope of positive section.
Returns:
diff --git a/tensorflow/python/kernel_tests/attention_ops_test.py b/tensorflow/python/kernel_tests/attention_ops_test.py
index f9c1727309..9e8a4f1706 100644
--- a/tensorflow/python/kernel_tests/attention_ops_test.py
+++ b/tensorflow/python/kernel_tests/attention_ops_test.py
@@ -21,6 +21,7 @@ from __future__ import print_function
import numpy as np
from tensorflow.python.framework import constant_op
+from tensorflow.python.framework import dtypes
from tensorflow.python.ops import array_ops
from tensorflow.python.ops import image_ops
from tensorflow.python.platform import test
@@ -196,6 +197,18 @@ class ExtractGlimpseTest(test.TestCase):
expected_rows=[None, None, None, 1, 2, 3, 4],
expected_cols=[56, 57, 58, 59, 60])
+ def testGlimpseNonNormalizedNonCentered(self):
+ img = constant_op.constant(np.arange(25).reshape((1, 5, 5, 1)),
+ dtype=dtypes.float32)
+ with self.test_session():
+ result1 = image_ops.extract_glimpse(img, [3, 3], [[0, 0]],
+ centered=False, normalized=False)
+ result2 = image_ops.extract_glimpse(img, [3, 3], [[1, 0]],
+ centered=False, normalized=False)
+ self.assertAllEqual(np.asarray([[0, 1, 2], [5, 6, 7], [10, 11, 12]]),
+ result1.eval()[0, :, :, 0])
+ self.assertAllEqual(np.asarray([[5, 6, 7], [10, 11, 12], [15, 16, 17]]),
+ result2.eval()[0, :, :, 0])
if __name__ == '__main__':
test.main()
diff --git a/tensorflow/python/kernel_tests/in_topk_op_test.py b/tensorflow/python/kernel_tests/in_topk_op_test.py
index 37e9a8e3d1..fafeea8ec0 100644
--- a/tensorflow/python/kernel_tests/in_topk_op_test.py
+++ b/tensorflow/python/kernel_tests/in_topk_op_test.py
@@ -22,7 +22,6 @@ import numpy as np
from tensorflow.python.framework import constant_op
from tensorflow.python.framework import errors_impl
-from tensorflow.python.ops import gen_nn_ops
from tensorflow.python.ops import nn_ops
from tensorflow.python.platform import test
@@ -77,9 +76,7 @@ class InTopKTest(test.TestCase):
k = constant_op.constant(3)
np_ans = np.array([False, True])
with self.test_session():
- # TODO (yongtang): The test will be switch to nn_ops.in_top
- # once nn_ops.in_top points to _in_top_kv2 later
- precision = gen_nn_ops._in_top_kv2(predictions, target, k)
+ precision = nn_ops.in_top_k(predictions, target, k)
out = precision.eval()
self.assertAllClose(np_ans, out)
self.assertShapeEqual(np_ans, precision)
diff --git a/tensorflow/python/ops/array_ops.py b/tensorflow/python/ops/array_ops.py
index e2587a1206..d096c11f0f 100644
--- a/tensorflow/python/ops/array_ops.py
+++ b/tensorflow/python/ops/array_ops.py
@@ -1482,7 +1482,8 @@ def zeros_like(tensor, dtype=None, name=None, optimize=True):
# For now, variant types must be created via zeros_like; as we need to
# pass the input variant object to the proper zeros callback.
- if tensor.shape.is_fully_defined() and tensor.dtype != dtypes.variant:
+ if optimize and tensor.shape.is_fully_defined() and \
+ tensor.dtype != dtypes.variant:
# We can produce a zeros tensor independent of the value of 'tensor',
# since the shape is known statically.
return zeros(tensor.shape, dtype=dtype or tensor.dtype, name=name)
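With the `optimize` flag now honored, a caller can keep the graph dependence on the input instead of having the zeros constant-folded from its static shape. A short usage sketch (not part of the patch):

```python
import tensorflow as tf

x = tf.constant([[1, 2], [3, 4]])
z_const = tf.zeros_like(x, optimize=True)   # may be built from the static shape alone
z_dep = tf.zeros_like(x, optimize=False)    # stays a ZerosLike op that consumes x
```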
diff --git a/tensorflow/python/ops/control_flow_grad.py b/tensorflow/python/ops/control_flow_grad.py
index 496c5addad..3c082b19b6 100644
--- a/tensorflow/python/ops/control_flow_grad.py
+++ b/tensorflow/python/ops/control_flow_grad.py
@@ -117,7 +117,7 @@ def _MergeGrad(op, grad, _):
# Add the stack pop op. If pred.op is in a (outer) CondContext,
# the stack pop will be guarded with a switch.
- real_pred = grad_state.AddBackPropAccumulatedValue(history_pred, pred)
+ real_pred = grad_state.AddBackpropAccumulatedValue(history_pred, pred)
grad_state.history_map[pred.name] = real_pred
pred = real_pred
# pylint: disable=protected-access
@@ -214,9 +214,9 @@ def _EnterGrad(op, grad):
if op.get_attr("is_constant"):
# Add a gradient accumulator for each loop invariant.
if isinstance(grad, ops.Tensor):
- result = grad_ctxt.AddBackPropAccumulator(op, grad)
+ result = grad_ctxt.AddBackpropAccumulator(op, grad)
elif isinstance(grad, ops.IndexedSlices):
- result = grad_ctxt.AddBackPropIndexedSlicesAccumulator(op, grad)
+ result = grad_ctxt.AddBackpropIndexedSlicesAccumulator(op, grad)
else:
# TODO(yuanbyu, lukasr): Add support for SparseTensor.
raise TypeError("Type %s not supported" % type(grad))
diff --git a/tensorflow/python/ops/control_flow_ops.py b/tensorflow/python/ops/control_flow_ops.py
index e8f64d5817..4b9b34b49d 100644
--- a/tensorflow/python/ops/control_flow_ops.py
+++ b/tensorflow/python/ops/control_flow_ops.py
@@ -683,7 +683,7 @@ class GradLoopState(object):
# The while loop context for backprop.
self._grad_context = None
- # The loop counter added by AddBackPropLoopCounter. It is the value
+ # The loop counter added by AddBackpropLoopCounter. It is the value
# of the loop counter for the current iteration.
self._grad_index = None
@@ -725,8 +725,8 @@ class GradLoopState(object):
forward_ctxt.swap_memory,
forward_ctxt.name,
self)
- real_cnt = outer_grad_state.AddBackPropAccumulatedValue(history_cnt, cnt)
- self._grad_index = self._grad_context.AddBackPropLoopCounter(
+ real_cnt = outer_grad_state.AddBackpropAccumulatedValue(history_cnt, cnt)
+ self._grad_index = self._grad_context.AddBackpropLoopCounter(
real_cnt, outer_grad_state)
outer_grad_ctxt.Exit()
else:
@@ -736,7 +736,7 @@ class GradLoopState(object):
forward_ctxt.swap_memory,
forward_ctxt.name,
self)
- self._grad_index = self._grad_context.AddBackPropLoopCounter(
+ self._grad_index = self._grad_context.AddBackpropLoopCounter(
cnt, outer_grad_state)
if outer_forward_ctxt: outer_forward_ctxt.Exit()
@@ -914,7 +914,7 @@ class GradLoopState(object):
push.op._add_control_input(add_op)
return acc
- def AddBackPropAccumulatedValue(self, history_value, value,
+ def AddBackpropAccumulatedValue(self, history_value, value,
dead_branch=False):
"""Add the getter for an accumulated value in the grad context.
@@ -1013,7 +1013,7 @@ class GradLoopState(object):
if real_value is None:
# Add the stack pop op in the grad context.
- real_value = cur_grad_state.AddBackPropAccumulatedValue(history_value,
+ real_value = cur_grad_state.AddBackpropAccumulatedValue(history_value,
cur_value)
if cur_grad_state != self:
real_value = self._grad_context.AddValue(real_value)
@@ -1170,7 +1170,7 @@ class ControlFlowState(object):
# Get the shape back from the stack.
outer_grad_ctxt = outer_grad_state.grad_context
outer_grad_ctxt.Enter()
- real_shape = outer_grad_state.AddBackPropAccumulatedValue(
+ real_shape = outer_grad_state.AddBackpropAccumulatedValue(
history_shape, shape)
result = array_ops.zeros(real_shape, val.dtype)
outer_grad_ctxt.Exit()
@@ -1240,7 +1240,7 @@ class ControlFlowState(object):
grad_state.grad_context.Enter()
# Create a zero tensor with the right shape.
- shape = grad_state.AddBackPropAccumulatedValue(
+ shape = grad_state.AddBackpropAccumulatedValue(
history_zeros_shape, zeros_shape, dead_branch)
result = array_ops.zeros(shape, val.dtype)
return result
@@ -2282,7 +2282,7 @@ class WhileContext(ControlFlowContext):
self.Exit()
return total_iterations, next_n
- def AddBackPropLoopCounter(self, count, outer_grad_state):
+ def AddBackpropLoopCounter(self, count, outer_grad_state):
"""Add the backprop loop that controls the iterations.
This is added to the backprop loop. It is used to control the loop
@@ -2336,7 +2336,7 @@ class WhileContext(ControlFlowContext):
self.Exit()
return next_count
- def AddBackPropAccumulator(self, op, grad):
+ def AddBackpropAccumulator(self, op, grad):
"""Add an accumulation loop for every loop invariant.
This is added to the backprop loop. It is used to accumulate partial
@@ -2382,7 +2382,7 @@ class WhileContext(ControlFlowContext):
history_zeros_shape = outer_grad_state.AddForwardAccumulator(
zeros_shape)
self.outer_context.Enter()
- real_shape = outer_grad_state.AddBackPropAccumulatedValue(
+ real_shape = outer_grad_state.AddBackpropAccumulatedValue(
history_zeros_shape, zeros_shape)
acc = array_ops.zeros(real_shape, grad.dtype)
self.outer_context.Exit()
@@ -2412,10 +2412,10 @@ class WhileContext(ControlFlowContext):
self.ExitResult([result_acc])
return result_acc
- def AddBackPropIndexedSlicesAccumulator(self, op, grad):
+ def AddBackpropIndexedSlicesAccumulator(self, op, grad):
"""This is used for accumulating gradients that are IndexedSlices.
- This is essentially the equavalent of AddBackPropAccumulator but optimized
+ This is essentially the equivalent of AddBackpropAccumulator but optimized
for things like updating embeddings from within a while loop.
Args:
@@ -3039,7 +3039,7 @@ def case(pred_fn_pairs, default=None, exclusive=False, strict=False,
If `exclusive==True`, all predicates are evaluated, and an exception is
thrown if more than one of the predicates evaluates to `True`.
- If `exclusive==False`, execution stops are the first predicate which
+ If `exclusive==False`, execution stops at the first predicate which
evaluates to True, and the tensors generated by the corresponding function
are returned immediately. If none of the predicates evaluate to True, this
operation returns the tensors generated by `default`.
diff --git a/tensorflow/python/ops/image_ops_impl.py b/tensorflow/python/ops/image_ops_impl.py
index ebdf5ea09e..46e2d2458a 100644
--- a/tensorflow/python/ops/image_ops_impl.py
+++ b/tensorflow/python/ops/image_ops_impl.py
@@ -397,9 +397,10 @@ def central_crop(image, central_fraction):
img_shape = array_ops.shape(image)
depth = image.get_shape()[2]
- fraction_offset = int(1 / ((1 - central_fraction) / 2.0))
- bbox_h_start = math_ops.div(img_shape[0], fraction_offset)
- bbox_w_start = math_ops.div(img_shape[1], fraction_offset)
+ img_h = math_ops.to_double(img_shape[0])
+ img_w = math_ops.to_double(img_shape[1])
+ bbox_h_start = math_ops.to_int32((img_h - img_h * central_fraction) / 2)
+ bbox_w_start = math_ops.to_int32((img_w - img_w * central_fraction) / 2)
bbox_h_size = img_shape[0] - bbox_h_start * 2
bbox_w_size = img_shape[1] - bbox_w_start * 2
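A worked numeric check of the new offset math (a sketch, not part of the patch), matching the `[80, 106, 3]` shape asserted in the test change that follows; `to_int32` truncates toward zero:

```python
h, w, central_fraction = 240, 320, 0.33
bbox_h_start = int((h - h * central_fraction) / 2)  # int(80.4)  -> 80
bbox_w_start = int((w - w * central_fraction) / 2)  # int(107.2) -> 107
bbox_h_size = h - 2 * bbox_h_start                  # 80
bbox_w_size = w - 2 * bbox_w_start                  # 106
assert (bbox_h_size, bbox_w_size) == (80, 106)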
diff --git a/tensorflow/python/ops/image_ops_test.py b/tensorflow/python/ops/image_ops_test.py
index 180e510669..0e6f313af7 100644
--- a/tensorflow/python/ops/image_ops_test.py
+++ b/tensorflow/python/ops/image_ops_test.py
@@ -1260,6 +1260,19 @@ class CentralCropTest(test_util.TensorFlowTestCase):
y = image_ops.central_crop(x, 0.5)
y_tf = y.eval()
self.assertAllEqual(y_tf, y_np)
+ self.assertAllEqual(y_tf.shape, y_np.shape)
+
+ def testCropping2(self):
+ # Test case for 10315
+ x_shape = [240, 320, 3]
+ x_np = np.zeros(x_shape, dtype=np.int32)
+ y_np = np.zeros([80, 106, 3], dtype=np.int32)
+ with self.test_session(use_gpu=True):
+ x = array_ops.placeholder(shape=x_shape, dtype=dtypes.int32)
+ y = image_ops.central_crop(x, 0.33)
+ y_tf = y.eval(feed_dict={x:x_np})
+ self.assertAllEqual(y_tf, y_np)
+ self.assertAllEqual(y_tf.shape, y_np.shape)
def testShapeInference(self):
# Test no-op fraction=1.0
diff --git a/tensorflow/python/ops/lookup_ops.py b/tensorflow/python/ops/lookup_ops.py
index f9b1733dda..bbfa38aa17 100644
--- a/tensorflow/python/ops/lookup_ops.py
+++ b/tensorflow/python/ops/lookup_ops.py
@@ -40,8 +40,7 @@ from tensorflow.python.util import compat
from tensorflow.python.util.deprecation import deprecated
-# TODO(yleon): Remove this function.
-@deprecated("2017-03-02", "Use `tf.tables_initializer` instead.")
+@deprecated(None, "Use `tf.tables_initializer` instead.")
def initialize_all_tables(name="init_all_tables"):
"""Returns an Op that initializes all tables of the default graph.
diff --git a/tensorflow/python/ops/nn_fused_batchnorm_test.py b/tensorflow/python/ops/nn_fused_batchnorm_test.py
index 48d1d5b25a..1c1554e9f3 100644
--- a/tensorflow/python/ops/nn_fused_batchnorm_test.py
+++ b/tensorflow/python/ops/nn_fused_batchnorm_test.py
@@ -198,8 +198,7 @@ class BatchNormalizationTest(test.TestCase):
epsilon = y.op.get_attr('epsilon')
data_format = y.op.get_attr('data_format')
grad_vals = sess.run([grad_x, grad_scale, grad_offset])
- grad_internal = nn_grad._BatchNormGrad(grad_y, x, scale, epsilon,
- data_format)
+ grad_internal = nn_grad._BatchNormGrad(grad_y, x, scale, pop_mean, pop_var, epsilon, data_format)
grad_internal_vals = sess.run(list(grad_internal))
for grad_val, grad_internal_val in zip(grad_vals, grad_internal_vals):
self.assertAllClose(grad_val, grad_internal_val, atol=err_tolerance)
diff --git a/tensorflow/python/ops/nn_grad.py b/tensorflow/python/ops/nn_grad.py
index 54627b6fd9..c3e8d403ba 100644
--- a/tensorflow/python/ops/nn_grad.py
+++ b/tensorflow/python/ops/nn_grad.py
@@ -736,64 +736,85 @@ def _FusedBatchNormGrad(op, *grad):
else:
pop_mean = op.inputs[3]
pop_var = op.inputs[4]
- if data_format == b"NHWC":
- reduce_axis = [0, 1, 2]
- else:
- reduce_axis = [0, 2, 3]
- shape = [1, array_ops.size(pop_mean), 1, 1]
- pop_mean = array_ops.reshape(pop_mean, shape)
- pop_var = array_ops.reshape(pop_var, shape)
- scale = array_ops.reshape(scale, shape)
-
- grad_offset = math_ops.reduce_sum(grad_y, axis=reduce_axis)
- var_rsqrt = math_ops.rsqrt(pop_var + epsilon)
- grad_scale = math_ops.reduce_sum(
- grad_y * (x - pop_mean) * var_rsqrt, axis=reduce_axis)
- grad_x = grad_y * scale * var_rsqrt
- return grad_x, grad_scale, grad_offset, None, None
+ if data_format == b"NCHW":
+ x = array_ops.transpose(x, [0, 2, 3, 1])
+ grad_y = array_ops.transpose(grad_y, [0, 2, 3, 1])
+ dx, dscale, doffset, _, _ = gen_nn_ops.fused_batch_norm_grad(
+ grad_y,
+ x,
+ scale,
+ pop_mean,
+ pop_var,
+ epsilon=epsilon,
+ data_format='NHWC',
+ is_training=is_training)
+ if data_format == b"NCHW":
+ dx = array_ops.transpose(dx, [0, 3, 1, 2])
+ return dx, dscale, doffset, None, None
-def _BatchNormGrad(grad_y, x, scale, epsilon, data_format):
+def _BatchNormGrad(grad_y, x, scale, pop_mean, pop_var, epsilon, data_format, is_training=True):
"""Returns the gradients for the 3 inputs of BatchNorm.
Args:
grad_y: A `Tensor` of 4 dimensions for gradient for y.
x: A `Tensor` of 4 dimensions for x.
scale: A `Tensor` of 1 dimension for scaling.
+ pop_mean: A `Tensor` of 1 dimension for the population mean. Only used when is_training=False.
+ pop_var: A `Tensor` of 1 dimension for the population variance. Only used when is_training=False.
epsilon: A small float number added to the variance of x.
data_format: The data format for input. Either b"NHWC" or b"NCHW".
+ is_training: A bool value to indicate whether the operation is for training (default)
+ or inference.
Returns:
A tuple (grad_x, grad_scale, grad_offset), where grad_x is the gradient
for x, grad_scale the gradient for scale, and grad_offset the gradient
for offset.
"""
- if data_format == b"NHWC":
- keep_dims = False
- reduce_axis = [0, 1, 2]
+ if is_training:
+ if data_format == b"NHWC":
+ keep_dims = False
+ reduce_axis = [0, 1, 2]
+ else:
+ keep_dims = True
+ reduce_axis = [0, 2, 3]
+ shape = [1, array_ops.size(scale), 1, 1]
+ scale = array_ops.reshape(scale, shape)
+ mean_grad_y = math_ops.reduce_mean(grad_y, reduce_axis, keep_dims=keep_dims)
+ mean_x = math_ops.reduce_mean(x, reduce_axis, keep_dims=keep_dims)
+ var_x = math_ops.reduce_mean(
+ math_ops.squared_difference(x, array_ops.stop_gradient(mean_x)),
+ reduce_axis,
+ keep_dims=keep_dims)
+ grad_y_offset = grad_y - mean_grad_y
+ x_offset = x - mean_x
+ mean = math_ops.reduce_mean(
+ grad_y * x_offset, axis=reduce_axis, keep_dims=keep_dims)
+ grad_x = scale * math_ops.rsqrt(var_x + epsilon) * (
+ grad_y_offset - math_ops.reciprocal(var_x + epsilon) * mean * x_offset)
+ grad_scale = math_ops.rsqrt(var_x + epsilon) * math_ops.reduce_sum(
+ grad_y * x_offset, axis=reduce_axis, keep_dims=keep_dims)
+ if data_format == b"NCHW":
+ grad_scale = array_ops.squeeze(grad_scale)
+ grad_offset = math_ops.reduce_sum(grad_y, axis=reduce_axis)
+ return grad_x, grad_scale, grad_offset
else:
- keep_dims = True
- reduce_axis = [0, 2, 3]
- shape = [1, array_ops.size(scale), 1, 1]
- scale = array_ops.reshape(scale, shape)
- mean_grad_y = math_ops.reduce_mean(grad_y, reduce_axis, keep_dims=keep_dims)
- mean_x = math_ops.reduce_mean(x, reduce_axis, keep_dims=keep_dims)
- var_x = math_ops.reduce_mean(
- math_ops.squared_difference(x, array_ops.stop_gradient(mean_x)),
- reduce_axis,
- keep_dims=keep_dims)
- grad_y_offset = grad_y - mean_grad_y
- x_offset = x - mean_x
- mean = math_ops.reduce_mean(
- grad_y * x_offset, axis=reduce_axis, keep_dims=keep_dims)
- grad_x = scale * math_ops.rsqrt(var_x + epsilon) * (
- grad_y_offset - math_ops.reciprocal(var_x + epsilon) * mean * x_offset)
- grad_scale = math_ops.rsqrt(var_x + epsilon) * math_ops.reduce_sum(
- grad_y * x_offset, axis=reduce_axis, keep_dims=keep_dims)
- if data_format == b"NCHW":
- grad_scale = array_ops.squeeze(grad_scale)
- grad_offset = math_ops.reduce_sum(grad_y, axis=reduce_axis)
- return grad_x, grad_scale, grad_offset
+ if data_format == b"NHWC":
+ reduce_axis = [0, 1, 2]
+ else:
+ reduce_axis = [0, 2, 3]
+ shape = [1, array_ops.size(pop_mean), 1, 1]
+ pop_mean = array_ops.reshape(pop_mean, shape)
+ pop_var = array_ops.reshape(pop_var, shape)
+ scale = array_ops.reshape(scale, shape)
+
+ grad_offset = math_ops.reduce_sum(grad_y, axis=reduce_axis)
+ var_rsqrt = math_ops.rsqrt(pop_var + epsilon)
+ grad_scale = math_ops.reduce_sum(
+ grad_y * (x - pop_mean) * var_rsqrt, axis=reduce_axis)
+ grad_x = grad_y * scale * var_rsqrt
+ return grad_x, grad_scale, grad_offset
@ops.RegisterGradient("FusedBatchNormGrad")
@@ -813,14 +834,17 @@ def _FusedBatchNormGradGrad(op, *grad):
"""
data_format = op.get_attr("data_format")
epsilon = op.get_attr("epsilon")
+ is_training = op.get_attr("is_training")
grad_y = op.inputs[0]
x = op.inputs[1]
scale = op.inputs[2]
+ pop_mean = op.inputs[3]
+ pop_var = op.inputs[4]
grad_grad_x = grad[0]
grad_grad_scale = grad[1]
grad_grad_offset = grad[2]
- grad_x, grad_scale, grad_offset = _BatchNormGrad(grad_y, x, scale, epsilon,
- data_format)
+ grad_x, grad_scale, grad_offset = _BatchNormGrad(
+ grad_y, x, scale, pop_mean, pop_var, epsilon, data_format, is_training)
grad_initial = [grad_grad_x, grad_grad_scale, grad_grad_offset]
grad_grad_y, grad_x, grad_scale = gradients_impl.gradients(
[grad_x, grad_scale, grad_offset], [grad_y, x, scale], grad_initial)
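The inference branch kept in the `else` above treats the normalization as an affine map with frozen statistics. A NumPy sketch of those three gradients for NHWC input (an illustration, not the TensorFlow kernel):

```python
import numpy as np

def batch_norm_grad_inference(grad_y, x, scale, pop_mean, pop_var, epsilon):
  # Reduce over batch and spatial axes; the channel axis is last (NHWC).
  var_rsqrt = 1.0 / np.sqrt(pop_var + epsilon)
  grad_x = grad_y * scale * var_rsqrt
  grad_scale = np.sum(grad_y * (x - pop_mean) * var_rsqrt, axis=(0, 1, 2))
  grad_offset = np.sum(grad_y, axis=(0, 1, 2))
  return grad_x, grad_scale, grad_offset
```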
diff --git a/tensorflow/python/ops/nn_ops.py b/tensorflow/python/ops/nn_ops.py
index 67b490fcfb..bd726ca631 100644
--- a/tensorflow/python/ops/nn_ops.py
+++ b/tensorflow/python/ops/nn_ops.py
@@ -1022,8 +1022,8 @@ def conv2d_transpose(value,
axis = 3 if data_format == "NHWC" else 1
if not value.get_shape()[axis].is_compatible_with(filter.get_shape()[3]):
raise ValueError("input channels does not match filter's input channels, "
- "{} != {}".format(value.get_shape()[3], filter.get_shape(
- )[3]))
+ "{} != {}".format(value.get_shape()[axis],
+ filter.get_shape()[3]))
output_shape_ = ops.convert_to_tensor(output_shape, name="output_shape")
if not output_shape_.get_shape().is_compatible_with(tensor_shape.vector(4)):
@@ -2145,5 +2145,4 @@ def in_top_k(predictions, targets, k, name=None):
A `Tensor` of type `bool`. Computed Precision at `k` as a `bool Tensor`.
"""
with ops.name_scope(name, 'in_top_k'):
- # TODO (yongtang): Need to switch to v2 after 3 weeks.
- return gen_nn_ops._in_top_k(predictions, targets, k, name=name)
+ return gen_nn_ops._in_top_kv2(predictions, targets, k, name=name)
diff --git a/tensorflow/python/ops/summary_op_util.py b/tensorflow/python/ops/summary_op_util.py
index 06ea63704d..37b80d5e20 100644
--- a/tensorflow/python/ops/summary_op_util.py
+++ b/tensorflow/python/ops/summary_op_util.py
@@ -94,7 +94,7 @@ def summary_scope(name, family=None, default_name=None, values=None):
family = clean_tag(family)
# Use family name in the scope to ensure uniqueness of scope/tag.
scope_base_name = name if family is None else '{}/{}'.format(family, name)
- with ops.name_scope(scope_base_name, default_name, values=values) as scope:
+ with ops.name_scope(scope_base_name, default_name, values) as scope:
if family is None:
tag = scope.rstrip('/')
else:
diff --git a/tensorflow/python/platform/base.i b/tensorflow/python/platform/base.i
index aa33eb52c9..dbefca2be9 100644
--- a/tensorflow/python/platform/base.i
+++ b/tensorflow/python/platform/base.i
@@ -106,6 +106,10 @@ limitations under the License.
$1 = &temp;
}
+%typemap(out) int64_t {
+ $result = PyLong_FromLongLong($1);
+}
+
%typemap(out) string {
$result = PyBytes_FromStringAndSize($1.data(), $1.size());
}
diff --git a/tensorflow/python/training/learning_rate_decay.py b/tensorflow/python/training/learning_rate_decay.py
index 3074281feb..bb9e26d8b4 100644
--- a/tensorflow/python/training/learning_rate_decay.py
+++ b/tensorflow/python/training/learning_rate_decay.py
@@ -260,8 +260,12 @@ def polynomial_decay(learning_rate, global_step, decay_steps,
power = math_ops.cast(power, dtype)
if cycle:
# Find the first multiple of decay_steps that is bigger than global_step.
- decay_steps = math_ops.multiply(decay_steps,
- math_ops.ceil(global_step / decay_steps))
+ # If global_step is zero, set the multiplier to 1.
+ multiplier = control_flow_ops.cond(math_ops.equal(global_step, 0),
+ lambda: 1.0,
+ lambda: math_ops.ceil(
+ global_step / decay_steps))
+ decay_steps = math_ops.multiply(decay_steps, multiplier)
else:
# Make sure that the global_step used is not bigger than decay_steps.
global_step = math_ops.minimum(global_step, decay_steps)
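Why the `cond` is needed: with `cycle=True` and `global_step == 0`, `ceil(0 / decay_steps)` is 0, so `decay_steps` would become 0 and the decay ratio would divide by zero. A plain-Python sketch of the cycled decay (not part of the patch), showing that step 0 now returns the initial rate as the test added below expects:

```python
import math

def polynomial_decay_np(lr, global_step, decay_steps,
                        end_lr=0.0001, power=1.0, cycle=True):
  if cycle:
    multiplier = 1.0 if global_step == 0 else math.ceil(global_step / decay_steps)
    decay_steps = decay_steps * multiplier
  else:
    global_step = min(global_step, decay_steps)
  p = global_step / decay_steps
  return (lr - end_lr) * (1 - p) ** power + end_lr

assert abs(polynomial_decay_np(0.001, 0, 10) - 0.001) < 1e-9
```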
diff --git a/tensorflow/python/training/learning_rate_decay_test.py b/tensorflow/python/training/learning_rate_decay_test.py
index f61ec2ff68..77da3099fe 100644
--- a/tensorflow/python/training/learning_rate_decay_test.py
+++ b/tensorflow/python/training/learning_rate_decay_test.py
@@ -245,6 +245,18 @@ class SqrtDecayTest(test_util.TensorFlowTestCase):
expected = (lr - end_lr) * 0.25 ** power + end_lr
self.assertAllClose(decayed_lr.eval(), expected, 1e-6)
+class PolynomialDecayTest(test_util.TensorFlowTestCase):
+
+ def testBeginWithCycle(self):
+ with self.test_session():
+ lr = 0.001
+ decay_steps = 10
+ step = 0
+ decayed_lr = learning_rate_decay.polynomial_decay(lr, step,
+ decay_steps, cycle=True)
+ expected = lr
+ self.assertAllClose(decayed_lr.eval(), expected, 1e-6)
+
class ExponentialDecayTest(test_util.TensorFlowTestCase):
diff --git a/tensorflow/python/util/tf_decorator.py b/tensorflow/python/util/tf_decorator.py
index 8b4d22f427..b9cc1925fa 100644
--- a/tensorflow/python/util/tf_decorator.py
+++ b/tensorflow/python/util/tf_decorator.py
@@ -83,7 +83,8 @@ def make_decorator(target,
The `decorator_func` argument with new metadata attached.
"""
if decorator_name is None:
- decorator_name = _inspect.stack()[1][3] # Caller's name.
+ prev_frame = _inspect.currentframe().f_back
+ decorator_name = _inspect.getframeinfo(prev_frame)[2] # Caller's name.
decorator = TFDecorator(decorator_name, target, decorator_doc,
decorator_argspec)
setattr(decorator_func, '_tf_decorator', decorator)
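The switch away from `inspect.stack()` avoids building a snapshot of the entire call stack just to read one caller's name. A standalone sketch of the cheaper pattern (not part of the patch):

```python
import inspect

def caller_name():
  # Walk one frame back instead of materializing the whole stack.
  prev_frame = inspect.currentframe().f_back
  return inspect.getframeinfo(prev_frame)[2]  # index 2 is the function name

def decorated():
  return caller_name()

assert decorated() == 'decorated'
```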
diff --git a/tensorflow/stream_executor/BUILD b/tensorflow/stream_executor/BUILD
index ff15c1589d..1865240014 100644
--- a/tensorflow/stream_executor/BUILD
+++ b/tensorflow/stream_executor/BUILD
@@ -71,6 +71,7 @@ cc_library(
deps = [
":stream_executor",
"//tensorflow/core:lib",
+ "//tensorflow/core/kernels:ops_util",
"@local_config_cuda//cuda:cuda_headers",
] + if_cuda_is_configured([
"//tensorflow/core:cuda",
diff --git a/tensorflow/stream_executor/cuda/cuda_dnn.cc b/tensorflow/stream_executor/cuda/cuda_dnn.cc
index 08faeefe74..087ae556e7 100644
--- a/tensorflow/stream_executor/cuda/cuda_dnn.cc
+++ b/tensorflow/stream_executor/cuda/cuda_dnn.cc
@@ -19,6 +19,7 @@ limitations under the License.
#include <memory>
#include "third_party/eigen3/Eigen/Core"
+#include "tensorflow/core/util/env_var.h"
#include "tensorflow/stream_executor/cuda/cuda_activation.h"
#include "tensorflow/stream_executor/cuda/cuda_diagnostics.h"
#include "tensorflow/stream_executor/cuda/cuda_driver.h"
@@ -231,6 +232,7 @@ CUDNN_DNN_ROUTINE_EACH_R3(PERFTOOLS_GPUTOOLS_CUDNN_WRAP)
__macro(cudnnRNNBackwardData) \
__macro(cudnnRNNBackwardWeights) \
__macro(cudnnSetRNNDescriptor) \
+ __macro(cudnnSetRNNDescriptor_v6) \
__macro(cudnnGetFilterNdDescriptor)
// clang-format on
@@ -250,6 +252,17 @@ CUDNN_DNN_ROUTINE_EACH_R6(PERFTOOLS_GPUTOOLS_CUDNN_WRAP)
#undef CUDNN_DNN_ROUTINE_EACH_R6
#endif
+// APIs in R7
+// clang-format off
+#if CUDNN_VERSION >= 7000
+#define CUDNN_DNN_ROUTINE_EACH_R7(__macro) \
+ __macro(cudnnSetConvolutionMathType)
+
+// clang-format on
+CUDNN_DNN_ROUTINE_EACH_R7(PERFTOOLS_GPUTOOLS_CUDNN_WRAP)
+#undef CUDNN_DNN_ROUTINE_EACH_R7
+#endif
+
#undef CUDNN_DNN_ROUTINE_EACH
} // namespace wrap
@@ -260,8 +273,9 @@ cudnnHandle_t ToHandle(void* opaque_handle) {
return static_cast<cudnnHandle_t>(opaque_handle);
}
-cudnnConvolutionFwdAlgo_t ToConvForwardAlgo(dnn::AlgorithmType algorithm) {
- cudnnConvolutionFwdAlgo_t algo = cudnnConvolutionFwdAlgo_t(algorithm);
+cudnnConvolutionFwdAlgo_t ToConvForwardAlgo(dnn::AlgorithmDesc algorithm) {
+ cudnnConvolutionFwdAlgo_t algo =
+ cudnnConvolutionFwdAlgo_t(algorithm.algo_id());
switch (algo) {
case CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_GEMM:
case CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_PRECOMP_GEMM:
@@ -278,13 +292,14 @@ cudnnConvolutionFwdAlgo_t ToConvForwardAlgo(dnn::AlgorithmType algorithm) {
return algo;
default:
LOG(FATAL) << "Unsupported Cudnn convolution forward algorithm: "
- << algorithm;
+ << algorithm.algo_id();
}
}
cudnnConvolutionBwdDataAlgo_t ToConvBackwardDataAlgo(
- dnn::AlgorithmType algorithm) {
- cudnnConvolutionBwdDataAlgo_t algo = cudnnConvolutionBwdDataAlgo_t(algorithm);
+ dnn::AlgorithmDesc algorithm) {
+ cudnnConvolutionBwdDataAlgo_t algo =
+ cudnnConvolutionBwdDataAlgo_t(algorithm.algo_id());
switch (algo) {
case CUDNN_CONVOLUTION_BWD_DATA_ALGO_0:
case CUDNN_CONVOLUTION_BWD_DATA_ALGO_1:
@@ -300,14 +315,14 @@ cudnnConvolutionBwdDataAlgo_t ToConvBackwardDataAlgo(
default:
LOG(FATAL)
<< "Unsupported Cudnn convolution backward algorithm for data: "
- << algorithm;
+ << algorithm.algo_id();
}
}
cudnnConvolutionBwdFilterAlgo_t ToConvBackwardFilterAlgo(
- dnn::AlgorithmType algorithm) {
+ dnn::AlgorithmDesc algorithm) {
cudnnConvolutionBwdFilterAlgo_t algo =
- cudnnConvolutionBwdFilterAlgo_t(algorithm);
+ cudnnConvolutionBwdFilterAlgo_t(algorithm.algo_id());
switch (algo) {
case CUDNN_CONVOLUTION_BWD_FILTER_ALGO_0:
case CUDNN_CONVOLUTION_BWD_FILTER_ALGO_1:
@@ -322,7 +337,7 @@ cudnnConvolutionBwdFilterAlgo_t ToConvBackwardFilterAlgo(
default:
LOG(FATAL)
<< "Unsupported Cudnn convolution backward algorithm for filter: "
- << algorithm;
+ << algorithm.algo_id();
}
}
@@ -541,6 +556,17 @@ class ScopedFilterDescriptor {
SE_DISALLOW_COPY_AND_ASSIGN(ScopedFilterDescriptor);
};
+// A helper function to decide whether to enable the TENSOR_OP_MATH math type
+static bool TensorOpMathEnabled() {
+ static bool is_enabled = [] {
+ bool is_disabled;
+ TF_CHECK_OK(tensorflow::ReadBoolFromEnvVar("TF_DISABLE_TENSOR_OP_MATH",
+ /*default=*/false, &is_disabled));
+ return !is_disabled;
+ }();
+ return is_enabled;
+}
+
// Turns a ConvolutionDescriptor structure into a cudnn convolution handle
// within a scope.
class ScopedConvolutionDescriptor {
@@ -583,6 +609,24 @@ class ScopedConvolutionDescriptor {
LOG(FATAL) << "could not set cudnn convolution descriptor: "
<< ToString(status);
}
+ // NOTE(benbarsdell): This only applies if tensor op math is enabled
+ // and algo selection is set to Default.
+ this->set_use_tensor_op_math(true);
+ }
+
+ void set_use_tensor_op_math(bool use_tensor_op_math) {
+#if CUDNN_VERSION >= 7000
+ cudnnMathType_t math_type =
+ (use_tensor_op_math ? CUDNN_TENSOR_OP_MATH : CUDNN_DEFAULT_MATH);
+ if (TensorOpMathEnabled()) {
+ cudnnStatus_t status =
+ wrap::cudnnSetConvolutionMathType(parent_, handle_, math_type);
+ if (status != CUDNN_STATUS_SUCCESS) {
+ LOG(FATAL) << "could not set cudnn convolution math type: "
+ << ToString(status);
+ }
+ }
+#endif
}
~ScopedConvolutionDescriptor() {
@@ -1010,11 +1054,21 @@ class CudnnRnnDescriptor : public CudnnDescriptorCommon<dnn::RnnDescriptor> {
// Create the RNN handle
cudnnStatus_t status = wrap::cudnnCreateRNNDescriptor(parent_, &rnn_desc_);
CUDNN_RETURN_IF_FAIL(status, "Unable to create RNN descriptor");
+#if CUDNN_VERSION >= 6000
+ // TODO: allow the user to choose an algorithm.
+ cudnnRNNAlgo_t rnn_algo = CUDNN_RNN_ALGO_STANDARD;
+ status = wrap::cudnnSetRNNDescriptor_v6(
+ parent, cudnn_handle, rnn_desc_ /*rnnDesc*/, hidden_size /*hiddenSize*/,
+ num_layers /*numLayers*/, dropout_handle() /*dropoutDesc*/,
+ input_mode /*inputMode*/, direction_mode /*direction*/,
+ rnn_mode /*mode*/, rnn_algo /*algo*/, data_type /*dataType*/);
+#else
status = wrap::cudnnSetRNNDescriptor(
parent, rnn_desc_ /*rnnDesc*/, hidden_size /*hiddenSize*/,
num_layers /*numLayers*/, dropout_handle() /*dropoutDesc*/,
input_mode /*inputMode*/, direction_mode /*direction*/,
rnn_mode /*mode*/, data_type /*dataType*/);
+#endif
CUDNN_RETURN_IF_FAIL(status, "Unable to update RNN descriptor");
// Create the params handle.
@@ -1943,7 +1997,7 @@ inline cudnnConvolutionFwdAlgo_t GetCudnnConvolutionForwardAlgo(
return algo_to_use;
}
-dnn::AlgorithmType GetCudnnConvolutionForwardAlgorithm(
+dnn::AlgorithmDesc GetCudnnConvolutionForwardAlgorithm(
Stream* stream, CUDAExecutor* parent, void* dnn_handle,
int cudnn_type, // Actually cudnnDataType_t.
const dnn::AlgorithmConfig& algorithm_config, bool is_profiling,
@@ -1952,13 +2006,18 @@ dnn::AlgorithmType GetCudnnConvolutionForwardAlgorithm(
const ScopedConvolutionDescriptor& conv,
const ScopedTensorDescriptor& output_nd,
ScratchAllocator* scratch_allocator, DeviceMemory<uint8>* scratch) {
- cudnnConvolutionFwdAlgo_t algo =
- (algorithm_config.algorithm() == dnn::kDefaultAlgorithm)
- ? GetCudnnConvolutionForwardAlgo(
- stream, parent, dnn_handle, input_nd, filter, conv, output_nd,
- /*specify_workspace_limit=*/scratch_allocator != nullptr,
- scratch_allocator)
- : ToConvForwardAlgo(algorithm_config.algorithm());
+ cudnnConvolutionFwdAlgo_t algo;
+ bool use_tensor_ops;
+ if (algorithm_config.algorithm().is_default()) {
+ use_tensor_ops = true;
+ algo = GetCudnnConvolutionForwardAlgo(
+ stream, parent, dnn_handle, input_nd, filter, conv, output_nd,
+ /*specify_workspace_limit=*/scratch_allocator != nullptr,
+ scratch_allocator);
+ } else {
+ use_tensor_ops = algorithm_config.algorithm().tensor_ops_enabled();
+ algo = ToConvForwardAlgo(algorithm_config.algorithm());
+ }
size_t size_in_bytes;
auto status = wrap::cudnnGetConvolutionForwardWorkspaceSize(
parent, ToHandle(dnn_handle), /*srcDesc=*/input_nd.handle(),
@@ -1969,16 +2028,16 @@ dnn::AlgorithmType GetCudnnConvolutionForwardAlgorithm(
if (TF_PREDICT_FALSE(status != CUDNN_STATUS_SUCCESS)) {
CHECK(is_profiling) << "Cannot query the size of workspace needed "
"for the specified algorithm: "
- << algorithm_config.algorithm() << " "
+ << algorithm_config.algorithm().algo_id() << " "
<< ToString(status);
// Silently return when we are profiling.
- return dnn::kNoSuitableAlgorithmFound;
+ return dnn::AlgorithmDesc();
}
if (TF_PREDICT_FALSE(size_in_bytes_int64 < 0)) {
LOG(WARNING) << "cudnnGetConvolutionForwardWorkspaceSize() returned "
"negative sizeInBytes value. This could be a cudnn bug.";
if (TF_PREDICT_TRUE(is_profiling)) {
- return dnn::kNoSuitableAlgorithmFound;
+ return dnn::AlgorithmDesc();
}
} else if (size_in_bytes_int64 > 0) {
port::StatusOr<DeviceMemory<uint8>> allocated;
@@ -1989,26 +2048,30 @@ dnn::AlgorithmType GetCudnnConvolutionForwardAlgorithm(
} else {
if (TF_PREDICT_TRUE(is_profiling)) {
// Silently return when we are profiling.
- return dnn::kNoSuitableAlgorithmFound;
+ return dnn::AlgorithmDesc();
}
LOG(WARNING) << allocated.status().error_message();
// For the int8 case, we fail at this point since the no_scratch
// algorithm should be set to dnn::kDefaultAlgorithm.
- CHECK(algorithm_config.algorithm_no_scratch() != dnn::kDefaultAlgorithm)
+ CHECK(!algorithm_config.algorithm_no_scratch().is_default())
<< "The primary convolution algorithm failed memory allocation, "
"while a secondary algorithm is not provided.";
}
}
if (TF_PREDICT_FALSE(!allocated.ok())) {
- algo = (algorithm_config.algorithm_no_scratch() == dnn::kDefaultAlgorithm)
- ? GetCudnnConvolutionForwardAlgo(
- stream, parent, dnn_handle, input_nd, filter, conv,
- output_nd, /*specify_workspace_limit=*/false, nullptr)
- : ToConvForwardAlgo(algorithm_config.algorithm_no_scratch());
+ if (algorithm_config.algorithm_no_scratch().is_default()) {
+ use_tensor_ops = true;
+ algo = GetCudnnConvolutionForwardAlgo(
+ stream, parent, dnn_handle, input_nd, filter, conv, output_nd,
+ /*specify_workspace_limit=*/false, nullptr);
+ } else {
+ use_tensor_ops = algorithm_config.algorithm_no_scratch().tensor_ops_enabled();
+ algo = ToConvForwardAlgo(algorithm_config.algorithm_no_scratch());
+ }
}
}
- return algo;
+ return dnn::AlgorithmDesc(algo, use_tensor_ops);
}
} // namespace
@@ -2050,11 +2113,12 @@ bool CudnnSupport::DoConvolveImpl(
const bool is_profiling = output_profile_result != nullptr;
cudnnConvolutionFwdAlgo_t algo;
+ bool use_tensor_ops;
DeviceMemory<uint8> scratch;
// TODO(pauldonnelly): Replace the following code with a call to
// GetCudnnConvolutionForwardAlgorithm().
- if (algorithm_config.algorithm() == dnn::kDefaultAlgorithm) {
+ if (algorithm_config.algorithm().is_default()) {
// With the default algorithm, use Cudnn's heuristics.
auto get_algorithm =
[&](bool specify_limit) SHARED_LOCKS_REQUIRED(dnn_handle_mutex_) {
@@ -2085,6 +2149,7 @@ bool CudnnSupport::DoConvolveImpl(
};
algo = get_algorithm(/*specify_limit=*/scratch_allocator != nullptr);
+ use_tensor_ops = true;
if (scratch_allocator != nullptr) {
size_t size_in_bytes;
status = wrap::cudnnGetConvolutionForwardWorkspaceSize(
@@ -2117,7 +2182,10 @@ bool CudnnSupport::DoConvolveImpl(
}
} else {
// An algorithm has been specified.
- algo = ToConvForwardAlgo(algorithm_config.algorithm());
+ dnn::AlgorithmDesc algotype = algorithm_config.algorithm();
+ algo = ToConvForwardAlgo(algotype);
+ use_tensor_ops = algotype.tensor_ops_enabled();
+ conv.set_use_tensor_op_math(use_tensor_ops);
size_t size_in_bytes;
status = wrap::cudnnGetConvolutionForwardWorkspaceSize(
parent_, ToHandle(dnn_handle_), /*srcDesc=*/input_nd.handle(),
@@ -2131,7 +2199,7 @@ bool CudnnSupport::DoConvolveImpl(
}
LOG(FATAL) << "Cannot query the size of workspace needed for the given "
"algorithm: "
- << algorithm_config.algorithm();
+ << algorithm_config.algorithm().algo_id();
}
int64 size_in_bytes_int64 = size_in_bytes;
if (size_in_bytes_int64 > 0) {
@@ -2150,10 +2218,13 @@ bool CudnnSupport::DoConvolveImpl(
LOG(WARNING) << allocated.status().error_message();
}
if (scratch == nullptr) {
- CHECK(algorithm_config.algorithm_no_scratch() != dnn::kDefaultAlgorithm)
+ CHECK(!algorithm_config.algorithm_no_scratch().is_default())
<< "The primary convolution algorithm failed memory allocation, "
"while a secondary algorithm is not provided.";
- algo = ToConvForwardAlgo(algorithm_config.algorithm_no_scratch());
+ dnn::AlgorithmDesc algotype = algorithm_config.algorithm_no_scratch();
+ algo = ToConvForwardAlgo(algotype);
+ use_tensor_ops = algotype.tensor_ops_enabled();
+ conv.set_use_tensor_op_math(use_tensor_ops);
}
} else if (size_in_bytes_int64 < 0) {
LOG(WARNING) << "cudnnGetConvolutionForwardWorkspaceSize() returned "
@@ -2189,7 +2260,8 @@ bool CudnnSupport::DoConvolveImpl(
return false;
}
if (status == CUDNN_STATUS_SUCCESS) {
- output_profile_result->set_algorithm(algo);
+ dnn::AlgorithmDesc algotype(algo, use_tensor_ops);
+ output_profile_result->set_algorithm(algotype);
output_profile_result->set_elapsed_time_in_ms(
timer->GetElapsedMilliseconds());
}
@@ -2250,17 +2322,18 @@ bool CudnnSupport::DoFusedConvolveImpl(
const bool is_profiling = output_profile_result != nullptr;
DeviceMemory<uint8> scratch;
- dnn::AlgorithmType algorithm_type = GetCudnnConvolutionForwardAlgorithm(
+ dnn::AlgorithmDesc algotype = GetCudnnConvolutionForwardAlgorithm(
stream, parent_, dnn_handle_, cudnn_data_type, algorithm_config,
is_profiling, conv_input_nd, filter, conv, output_nd, scratch_allocator,
&scratch);
- if (algorithm_type == dnn::kNoSuitableAlgorithmFound) {
+ if (algotype.is_default()) {
if (!is_profiling) {
LOG(ERROR) << "No suitable algorithm found";
}
return false;
}
- auto algo = static_cast<cudnnConvolutionFwdAlgo_t>(algorithm_type);
+ auto algo = static_cast<cudnnConvolutionFwdAlgo_t>(algotype.algo_id());
+ conv.set_use_tensor_op_math(algotype.tensor_ops_enabled());
if (activation_mode != dnn::ActivationMode::kRelu) {
LOG(ERROR) << "cudnnConvolutionBiasActivationForward() only supports Relu "
@@ -2326,7 +2399,7 @@ bool CudnnSupport::DoFusedConvolveImpl(
return false;
}
if (status == CUDNN_STATUS_SUCCESS) {
- output_profile_result->set_algorithm(algo);
+ output_profile_result->set_algorithm(algotype);
output_profile_result->set_elapsed_time_in_ms(
timer->GetElapsedMilliseconds());
}
@@ -2397,7 +2470,7 @@ struct WinogradNonfused {
bool CudnnSupport::GetConvolveAlgorithms(
bool with_winograd_nonfused,
- std::vector<dnn::AlgorithmType>* out_algorithms) {
+ std::vector<dnn::AlgorithmDesc::Index>* out_algorithms) {
out_algorithms->assign({
// clang-format off
CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_GEMM,
@@ -2423,7 +2496,7 @@ bool CudnnSupport::GetConvolveAlgorithms(
bool CudnnSupport::GetConvolveBackwardDataAlgorithms(
bool with_winograd_nonfused,
- std::vector<dnn::AlgorithmType>* out_algorithms) {
+ std::vector<dnn::AlgorithmDesc::Index>* out_algorithms) {
out_algorithms->assign({
// clang-format off
CUDNN_CONVOLUTION_BWD_DATA_ALGO_0,
@@ -2446,7 +2519,7 @@ bool CudnnSupport::GetConvolveBackwardDataAlgorithms(
bool CudnnSupport::GetConvolveBackwardFilterAlgorithms(
bool with_winograd_nonfused,
- std::vector<dnn::AlgorithmType>* out_algorithms) {
+ std::vector<dnn::AlgorithmDesc::Index>* out_algorithms) {
out_algorithms->assign({
// clang-format off
CUDNN_CONVOLUTION_BWD_FILTER_ALGO_0,
@@ -2858,7 +2931,7 @@ bool CudnnSupport::DoConvolveBackwardDataImpl(
cudnnConvolutionBwdDataAlgo_t algo;
DeviceMemory<uint8> scratch;
- if (algorithm_config.algorithm() == dnn::kDefaultAlgorithm) {
+ if (algorithm_config.algorithm().is_default()) {
// With the default algorithm, use Cudnn's heuristics.
auto get_algorithm = [&](bool specify_limit) SHARED_LOCKS_REQUIRED(
dnn_handle_mutex_) -> cudnnConvolutionBwdDataAlgo_t {
@@ -2927,7 +3000,9 @@ bool CudnnSupport::DoConvolveBackwardDataImpl(
}
} else {
// An algorithm has been specified.
- algo = ToConvBackwardDataAlgo(algorithm_config.algorithm());
+ dnn::AlgorithmDesc algotype = algorithm_config.algorithm();
+ algo = ToConvBackwardDataAlgo(algotype);
+ conv.set_use_tensor_op_math(algotype.tensor_ops_enabled());
size_t size_in_bytes;
status = wrap::cudnnGetConvolutionBackwardDataWorkspaceSize(
parent_, ToHandle(dnn_handle_),
@@ -2944,7 +3019,7 @@ bool CudnnSupport::DoConvolveBackwardDataImpl(
}
LOG(FATAL) << "Cannot query the size of workspace needed for the given "
"algorithm: "
- << algorithm_config.algorithm();
+ << algorithm_config.algorithm().algo_id();
}
int64 size_in_bytes_int64 = size_in_bytes;
if (size_in_bytes_int64 > 0) {
@@ -2963,10 +3038,12 @@ bool CudnnSupport::DoConvolveBackwardDataImpl(
LOG(WARNING) << allocated.status().error_message();
}
if (scratch == nullptr) {
- CHECK(algorithm_config.algorithm_no_scratch() != dnn::kDefaultAlgorithm)
+ CHECK(!algorithm_config.algorithm_no_scratch().is_default())
<< "The primary convolution algorithm failed memory allocation, "
"while a secondary algorithm is not provided.";
- algo = ToConvBackwardDataAlgo(algorithm_config.algorithm_no_scratch());
+ dnn::AlgorithmDesc algotype = algorithm_config.algorithm_no_scratch();
+ algo = ToConvBackwardDataAlgo(algotype);
+ conv.set_use_tensor_op_math(algotype.tensor_ops_enabled());
}
} else if (size_in_bytes_int64 < 0) {
LOG(WARNING) << "cudnnGetConvolutionBackwardDataWorkspaceSize() returned "
@@ -3005,7 +3082,9 @@ bool CudnnSupport::DoConvolveBackwardDataImpl(
if (is_profiling) {
timer->Stop(AsCUDAStream(stream));
if (status == CUDNN_STATUS_SUCCESS) {
- output_profile_result->set_algorithm(algo);
+ bool use_tensor_ops = algorithm_config.algorithm().tensor_ops_enabled();
+ dnn::AlgorithmDesc algotype(algo, use_tensor_ops);
+ output_profile_result->set_algorithm(algotype);
output_profile_result->set_elapsed_time_in_ms(
timer->GetElapsedMilliseconds());
}
@@ -3108,7 +3187,7 @@ bool CudnnSupport::DoConvolveBackwardFilterImpl(
cudnnConvolutionBwdFilterAlgo_t algo;
DeviceMemory<uint8> scratch;
- if (algorithm_config.algorithm() == dnn::kDefaultAlgorithm) {
+ if (algorithm_config.algorithm().is_default()) {
// With the default algorithm, use Cudnn's heuristics.
// Lambda that retrieves the algorithm.
@@ -3178,7 +3257,9 @@ bool CudnnSupport::DoConvolveBackwardFilterImpl(
}
} else {
// An algorithm has been specified.
- algo = ToConvBackwardFilterAlgo(algorithm_config.algorithm());
+ dnn::AlgorithmDesc algotype = algorithm_config.algorithm();
+ algo = ToConvBackwardFilterAlgo(algotype);
+ conv.set_use_tensor_op_math(algotype.tensor_ops_enabled());
size_t size_in_bytes;
status = wrap::cudnnGetConvolutionBackwardFilterWorkspaceSize(
@@ -3193,7 +3274,7 @@ bool CudnnSupport::DoConvolveBackwardFilterImpl(
}
LOG(FATAL) << "Cannot query the size of workspace needed for the given "
"algorithm: "
- << algorithm_config.algorithm();
+ << algorithm_config.algorithm().algo_id();
}
int64 size_in_bytes_int64 = size_in_bytes;
if (size_in_bytes_int64 > 0) {
@@ -3212,11 +3293,12 @@ bool CudnnSupport::DoConvolveBackwardFilterImpl(
LOG(WARNING) << allocated.status().error_message();
}
if (scratch == nullptr) {
- CHECK(algorithm_config.algorithm_no_scratch() != dnn::kDefaultAlgorithm)
+ CHECK(!algorithm_config.algorithm_no_scratch().is_default())
<< "The primary convolution algorithm failed memory allocation, "
"while a secondary algorithm is not provided.";
- algo =
- ToConvBackwardFilterAlgo(algorithm_config.algorithm_no_scratch());
+ dnn::AlgorithmDesc algotype = algorithm_config.algorithm_no_scratch();
+ algo = ToConvBackwardFilterAlgo(algotype);
+ conv.set_use_tensor_op_math(algotype.tensor_ops_enabled());
}
} else if (size_in_bytes_int64 < 0) {
LOG(WARNING)
@@ -3255,7 +3337,9 @@ bool CudnnSupport::DoConvolveBackwardFilterImpl(
if (is_profiling) {
timer->Stop(AsCUDAStream(stream));
if (status == CUDNN_STATUS_SUCCESS) {
- output_profile_result->set_algorithm(algo);
+ bool use_tensor_ops = algorithm_config.algorithm().tensor_ops_enabled();
+ dnn::AlgorithmDesc algotype(algo, use_tensor_ops);
+ output_profile_result->set_algorithm(algotype);
output_profile_result->set_elapsed_time_in_ms(
timer->GetElapsedMilliseconds());
}
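
The hunks above repeat the same selection-and-fallback pattern in DoConvolveImpl, DoFusedConvolveImpl, and the two backward passes. The following minimal, self-contained C++ sketch is not TensorFlow code: AlgoDesc, AlgoConfig, and PickForwardAlgo are illustrative stand-ins for dnn::AlgorithmDesc, dnn::AlgorithmConfig, and the cuDNN-specific helpers. It only shows the intent of the change: once kDefaultAlgorithm is replaced by a descriptor, the algorithm id and its tensor-ops flag travel together, including through the no-scratch fallback.

    // Illustrative stand-ins only; not the real stream_executor types.
    #include <cstdint>
    #include <iostream>

    class AlgoDesc {  // loosely mirrors dnn::AlgorithmDesc from this patch
     public:
      using Index = std::int64_t;
      AlgoDesc() : algo_(kDefault), tensor_ops_(false) {}
      AlgoDesc(Index a, bool use_tensor_ops) : algo_(a), tensor_ops_(use_tensor_ops) {}
      bool is_default() const { return algo_ == kDefault; }
      bool tensor_ops_enabled() const { return tensor_ops_; }
      Index algo_id() const { return algo_; }

     private:
      enum { kDefault = -1 };
      Index algo_;
      bool tensor_ops_;
    };

    struct AlgoConfig {  // loosely mirrors dnn::AlgorithmConfig
      AlgoDesc algorithm;
      AlgoDesc algorithm_no_scratch;
    };

    // Pick the requested algorithm (or a heuristic default), then fall back to
    // the no-scratch algorithm when scratch allocation failed. The tensor-ops
    // flag is read from whichever descriptor was actually chosen.
    AlgoDesc PickForwardAlgo(const AlgoConfig& config, bool scratch_allocation_failed) {
      AlgoDesc chosen = config.algorithm.is_default()
                            ? AlgoDesc(/*heuristic id*/ 0, /*use_tensor_ops=*/true)
                            : config.algorithm;
      if (scratch_allocation_failed && !config.algorithm_no_scratch.is_default()) {
        chosen = config.algorithm_no_scratch;
      }
      return chosen;
    }

    int main() {
      AlgoConfig config{AlgoDesc(4, /*use_tensor_ops=*/true), AlgoDesc(1, false)};
      AlgoDesc picked = PickForwardAlgo(config, /*scratch_allocation_failed=*/true);
      std::cout << "algo " << picked.algo_id() << ", tensor ops "
                << std::boolalpha << picked.tensor_ops_enabled() << "\n";
    }
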
diff --git a/tensorflow/stream_executor/cuda/cuda_dnn.h b/tensorflow/stream_executor/cuda/cuda_dnn.h
index db376e2a66..eaf06e179f 100644
--- a/tensorflow/stream_executor/cuda/cuda_dnn.h
+++ b/tensorflow/stream_executor/cuda/cuda_dnn.h
@@ -146,15 +146,15 @@ class CudnnSupport : public dnn::DnnSupport {
bool GetConvolveAlgorithms(
bool with_winograd_nonfused,
- std::vector<dnn::AlgorithmType>* out_algorithms) override;
+ std::vector<dnn::AlgorithmDesc::Index>* out_algorithms) override;
bool GetConvolveBackwardDataAlgorithms(
bool with_winograd_nonfused,
- std::vector<dnn::AlgorithmType>* out_algorithms) override;
+ std::vector<dnn::AlgorithmDesc::Index>* out_algorithms) override;
bool GetConvolveBackwardFilterAlgorithms(
bool with_winograd_nonfused,
- std::vector<dnn::AlgorithmType>* out_algorithms) override;
+ std::vector<dnn::AlgorithmDesc::Index>* out_algorithms) override;
bool DoBatchNormalizationForward(
Stream* stream, const DeviceMemory<float>& x,
diff --git a/tensorflow/stream_executor/dnn.cc b/tensorflow/stream_executor/dnn.cc
index 311f45f748..ed9bdf2bc2 100644
--- a/tensorflow/stream_executor/dnn.cc
+++ b/tensorflow/stream_executor/dnn.cc
@@ -23,17 +23,20 @@ namespace gputools {
namespace dnn {
bool DnnSupport::GetConvolveAlgorithms(
- bool with_winograd_nonfused, std::vector<AlgorithmType>* out_algorithms) {
+ bool with_winograd_nonfused,
+ std::vector<AlgorithmDesc::Index>* out_algorithms) {
return false;
}
bool DnnSupport::GetConvolveBackwardDataAlgorithms(
- bool with_winograd_nonfused, std::vector<AlgorithmType>* out_algorithms) {
+ bool with_winograd_nonfused,
+ std::vector<AlgorithmDesc::Index>* out_algorithms) {
return false;
}
bool DnnSupport::GetConvolveBackwardFilterAlgorithms(
- bool with_winograd_nonfused, std::vector<AlgorithmType>* out_algorithms) {
+ bool with_winograd_nonfused,
+ std::vector<AlgorithmDesc::Index>* out_algorithms) {
return false;
}
@@ -202,7 +205,8 @@ std::vector<int64> ReorderDims(const std::vector<int64>& input,
// -- AlgorithmConfig
string AlgorithmConfig::ToString() const {
- return port::StrCat(algorithm_, ", ", algorithm_no_scratch_);
+ return port::StrCat(algorithm_.algo_id(), ", ",
+ algorithm_no_scratch_.algo_id());
}
// -- BatchDescriptor
diff --git a/tensorflow/stream_executor/dnn.h b/tensorflow/stream_executor/dnn.h
index 0a4525c1b7..b11c6417be 100644
--- a/tensorflow/stream_executor/dnn.h
+++ b/tensorflow/stream_executor/dnn.h
@@ -667,9 +667,26 @@ class PoolingDescriptor {
std::vector<int64> strides_;
};
-typedef int64 AlgorithmType;
-constexpr AlgorithmType kDefaultAlgorithm = -1;
-constexpr AlgorithmType kNoSuitableAlgorithmFound = -2;
+// Collects parameters for DNN algorithms
+class AlgorithmDesc {
+ public:
+ typedef int64 Index;
+ AlgorithmDesc() : algo_(kDefaultAlgorithm), tensor_ops_enabled_(false) {}
+ AlgorithmDesc(Index a, bool use_tensor_ops)
+ : algo_(a), tensor_ops_enabled_(use_tensor_ops) {}
+ bool is_default() const { return algo_ == kDefaultAlgorithm; }
+ bool tensor_ops_enabled() const { return tensor_ops_enabled_; }
+ Index algo_id() const { return algo_; }
+ bool operator==(const AlgorithmDesc& other) const {
+ return this->algo_ == other.algo_ &&
+ this->tensor_ops_enabled_ == other.tensor_ops_enabled_;
+ }
+
+ private:
+ enum { kDefaultAlgorithm = -1 };
+ Index algo_;
+ bool tensor_ops_enabled_;
+};
// Describes the result from a perf experiment.
//
@@ -679,16 +696,16 @@ constexpr AlgorithmType kNoSuitableAlgorithmFound = -2;
class ProfileResult {
public:
bool is_valid() const {
- return (algorithm_ != kDefaultAlgorithm &&
+ return (!algorithm_.is_default() &&
elapsed_time_in_ms_ != std::numeric_limits<float>::max());
}
- AlgorithmType algorithm() const { return algorithm_; }
- void set_algorithm(AlgorithmType val) { algorithm_ = val; }
+ AlgorithmDesc algorithm() const { return algorithm_; }
+ void set_algorithm(AlgorithmDesc val) { algorithm_ = val; }
float elapsed_time_in_ms() const { return elapsed_time_in_ms_; }
void set_elapsed_time_in_ms(float val) { elapsed_time_in_ms_ = val; }
private:
- AlgorithmType algorithm_ = kDefaultAlgorithm;
+ AlgorithmDesc algorithm_;
float elapsed_time_in_ms_ = std::numeric_limits<float>::max();
};
@@ -700,17 +717,14 @@ class ProfileResult {
// the allocation for the scratch memory fails.
class AlgorithmConfig {
public:
- AlgorithmConfig()
- : algorithm_(kDefaultAlgorithm),
- algorithm_no_scratch_(kDefaultAlgorithm) {}
- explicit AlgorithmConfig(AlgorithmType algorithm)
- : algorithm_(algorithm), algorithm_no_scratch_(kDefaultAlgorithm) {}
- AlgorithmConfig(AlgorithmType algorithm, AlgorithmType algorithm_no_scratch)
+ AlgorithmConfig() {}
+ explicit AlgorithmConfig(AlgorithmDesc algorithm) : algorithm_(algorithm) {}
+ AlgorithmConfig(AlgorithmDesc algorithm, AlgorithmDesc algorithm_no_scratch)
: algorithm_(algorithm), algorithm_no_scratch_(algorithm_no_scratch) {}
- AlgorithmType algorithm() const { return algorithm_; }
- void set_algorithm(AlgorithmType val) { algorithm_ = val; }
- AlgorithmType algorithm_no_scratch() const { return algorithm_no_scratch_; }
- void set_algorithm_no_scratch(AlgorithmType val) {
+ AlgorithmDesc algorithm() const { return algorithm_; }
+ void set_algorithm(AlgorithmDesc val) { algorithm_ = val; }
+ AlgorithmDesc algorithm_no_scratch() const { return algorithm_no_scratch_; }
+ void set_algorithm_no_scratch(AlgorithmDesc val) {
algorithm_no_scratch_ = val;
}
bool operator==(const AlgorithmConfig& other) const {
@@ -723,8 +737,8 @@ class AlgorithmConfig {
string ToString() const;
private:
- AlgorithmType algorithm_;
- AlgorithmType algorithm_no_scratch_;
+ AlgorithmDesc algorithm_;
+ AlgorithmDesc algorithm_no_scratch_;
};
// Describes a local response normalization (LRN). LRN is used e.g. in
@@ -944,8 +958,8 @@ class DnnSupport {
// convolution result.
// scratch_allocator: un-owned, may-be-null object that may allocate scratch
// space in order to speed up the convolution operation.
- // algorithm: an integer to specify which algorithm should be used for the
- // operation. kDefaultAlgorithm means the system will pick an algorithm
+ // algorithm: specifies which algorithm should be used for the
+ // operation. If algorithm.is_default(), the system will pick an algorithm
// by default. The coding of the algorithm is to be interpreted by the
// underlying implementation.
// output_profile_result: the output profile result for this call. The
@@ -1112,7 +1126,8 @@ class DnnSupport {
// Return a list of algorithms supported by the forward convolution pass.
virtual bool GetConvolveAlgorithms(
- bool with_winograd_nonfused, std::vector<AlgorithmType>* out_algorithms);
+ bool with_winograd_nonfused,
+ std::vector<AlgorithmDesc::Index>* out_algorithms);
// Version of DoConvolve that uses pre-quantized 8 bit coefficients.
// coefficient_scales specifies the scaling of each column of coefficients:
@@ -1191,7 +1206,8 @@ class DnnSupport {
// Return a list of algorithms supported by the backward convolution pass for
// data.
virtual bool GetConvolveBackwardDataAlgorithms(
- bool with_winograd_nonfused, std::vector<AlgorithmType>* out_algorithms);
+ bool with_winograd_nonfused,
+ std::vector<AlgorithmDesc::Index>* out_algorithms);
virtual bool DoConvolveBackwardData(
Stream* stream, const FilterDescriptor& filter_descriptor,
@@ -1239,7 +1255,8 @@ class DnnSupport {
// Return a list of algorithms supported by the backward convolution pass for
// filters.
virtual bool GetConvolveBackwardFilterAlgorithms(
- bool with_winograd_nonfused, std::vector<AlgorithmType>* out_algorithms);
+ bool with_winograd_nonfused,
+ std::vector<AlgorithmDesc::Index>* out_algorithms);
virtual bool DoConvolveBackwardFilter(
Stream* stream, const BatchDescriptor& input_descriptor,
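
For context on the dnn.h migration above: callers that previously stored a bare int64 AlgorithmType now carry an (index, tensor-ops) pair. The sketch below is a hedged, self-contained illustration of how an autotuning caller might profile candidates under the new scheme; Candidate, Profile, and FakeRunMs are made-up names and the timing is faked, so only the shape of the data mirrors AlgorithmDesc and ProfileResult.

    // Hypothetical autotuning loop; Candidate, Profile, and FakeRunMs are made up.
    #include <cstdint>
    #include <initializer_list>
    #include <iostream>
    #include <limits>
    #include <vector>

    struct Candidate {
      std::int64_t index;  // one entry of the kind GetConvolveAlgorithms() now returns
      bool tensor_ops;     // each index can be tried with and without tensor ops
    };

    struct Profile {
      Candidate best{-1, false};
      float best_ms = std::numeric_limits<float>::max();
    };

    // Stand-in for launching one timed convolution; a real caller would run the
    // kernel and read a GPU timer instead of computing a fake number.
    float FakeRunMs(const Candidate& c) {
      return 10.0f - static_cast<float>(c.index) - (c.tensor_ops ? 0.5f : 0.0f);
    }

    int main() {
      std::vector<std::int64_t> indices = {0, 1, 2};  // as filled by GetConvolveAlgorithms()
      Profile profile;
      for (std::int64_t idx : indices) {
        for (bool tensor_ops : {false, true}) {
          Candidate c{idx, tensor_ops};
          float ms = FakeRunMs(c);
          if (ms < profile.best_ms) {
            profile.best_ms = ms;
            profile.best = c;
          }
        }
      }
      std::cout << "best: algo " << profile.best.index << ", tensor ops "
                << std::boolalpha << profile.best.tensor_ops << " ("
                << profile.best_ms << " ms)\n";
    }
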
diff --git a/tensorflow/stream_executor/stream.h b/tensorflow/stream_executor/stream.h
index a418fe961c..98484eb850 100644
--- a/tensorflow/stream_executor/stream.h
+++ b/tensorflow/stream_executor/stream.h
@@ -70,7 +70,7 @@ class BatchDescriptor;
class FilterDescriptor;
class ConvolutionDescriptor;
class ProfileResult;
-typedef int64 AlgorithmType;
+class AlgorithmDesc;
} // namespace dnn
class StreamExecutor;
diff --git a/tensorflow/stream_executor/stream_executor_pimpl.cc b/tensorflow/stream_executor/stream_executor_pimpl.cc
index e23d3ddace..199a908914 100644
--- a/tensorflow/stream_executor/stream_executor_pimpl.cc
+++ b/tensorflow/stream_executor/stream_executor_pimpl.cc
@@ -286,7 +286,7 @@ bool StreamExecutor::SupportsDnn() const {
bool StreamExecutor::GetConvolveAlgorithms(
bool with_winograd_nonfused,
- std::vector<dnn::AlgorithmType> *out_algorithms) {
+ std::vector<dnn::AlgorithmDesc::Index> *out_algorithms) {
dnn::DnnSupport *dnn_support = AsDnn();
if (!dnn_support) {
return false;
@@ -297,7 +297,7 @@ bool StreamExecutor::GetConvolveAlgorithms(
bool StreamExecutor::GetConvolveBackwardDataAlgorithms(
bool with_winograd_nonfused,
- std::vector<dnn::AlgorithmType> *out_algorithms) {
+ std::vector<dnn::AlgorithmDesc::Index> *out_algorithms) {
dnn::DnnSupport *dnn_support = AsDnn();
if (!dnn_support) {
return false;
@@ -308,7 +308,7 @@ bool StreamExecutor::GetConvolveBackwardDataAlgorithms(
bool StreamExecutor::GetConvolveBackwardFilterAlgorithms(
bool with_winograd_nonfused,
- std::vector<dnn::AlgorithmType> *out_algorithms) {
+ std::vector<dnn::AlgorithmDesc::Index> *out_algorithms) {
dnn::DnnSupport *dnn_support = AsDnn();
if (!dnn_support) {
return false;
diff --git a/tensorflow/stream_executor/stream_executor_pimpl.h b/tensorflow/stream_executor/stream_executor_pimpl.h
index d910eb8823..98136a92a0 100644
--- a/tensorflow/stream_executor/stream_executor_pimpl.h
+++ b/tensorflow/stream_executor/stream_executor_pimpl.h
@@ -343,19 +343,20 @@ class StreamExecutor {
bool SupportsDnn() const;
// Get the list of supported algorithms for the forward convolution operation.
- bool GetConvolveAlgorithms(bool with_winograd_nonfused,
- std::vector<dnn::AlgorithmType> *out_algorithms);
+ bool GetConvolveAlgorithms(
+ bool with_winograd_nonfused,
+ std::vector<dnn::AlgorithmDesc::Index> *out_algorithms);
// Get the list of supported algorithms for the backward convolution on data.
bool GetConvolveBackwardDataAlgorithms(
bool with_winograd_nonfused,
- std::vector<dnn::AlgorithmType> *out_algorithms);
+ std::vector<dnn::AlgorithmDesc::Index> *out_algorithms);
// Get the list of supported algorithms for the backward convolution on the
// filter.
bool GetConvolveBackwardFilterAlgorithms(
bool with_winograd_nonfused,
- std::vector<dnn::AlgorithmType> *out_algorithms);
+ std::vector<dnn::AlgorithmDesc::Index> *out_algorithms);
// Get the list of supported algorithms for BLAS gemm.
bool GetBlasGemmAlgorithms(std::vector<blas::AlgorithmType> *out_algorithms);
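
The pimpl header change above only alters the out-parameter type of the three Get*Algorithms queries. As a rough sketch of the implied call pattern, the snippet below uses a mocked executor (MockExecutor, a made-up stand-in for StreamExecutor with hard-coded indices) to show that callers now receive raw algorithm indices, presumably wrapping them into dnn::AlgorithmDesc, with or without tensor ops, before use.

    // MockExecutor is a made-up stand-in for StreamExecutor; only the shape of
    // the query (bool flag in, vector of int64 indices out) follows the header.
    #include <cstdint>
    #include <iostream>
    #include <vector>

    class MockExecutor {
     public:
      bool GetConvolveAlgorithms(bool with_winograd_nonfused,
                                 std::vector<std::int64_t>* out_indices) {
        out_indices->assign({0, 1, 2});
        if (with_winograd_nonfused) {
          out_indices->push_back(7);  // pretend one extra Winograd-style candidate
        }
        return true;
      }
    };

    int main() {
      MockExecutor exec;
      std::vector<std::int64_t> indices;
      if (exec.GetConvolveAlgorithms(/*with_winograd_nonfused=*/true, &indices)) {
        for (std::int64_t idx : indices) {
          std::cout << "candidate algorithm index " << idx << "\n";
        }
      }
    }
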
diff --git a/tensorflow/third_party/mpi/mpi.bzl b/tensorflow/third_party/mpi/mpi.bzl
new file mode 100644
index 0000000000..38ce91c4d0
--- /dev/null
+++ b/tensorflow/third_party/mpi/mpi.bzl
@@ -0,0 +1,17 @@
+#OpenMPI and MVAPICH/MPICH require different headers.
+#Based on the configuration options, return one or the other.
+
+def mpi_hdr():
+ MPI_LIB_IS_OPENMPI=True
+ hdrs = []
+ if MPI_LIB_IS_OPENMPI:
+ hdrs = ["mpi.h", "mpi_portable_platform.h"] #When using OpenMPI
+ else:
+ hdrs = ["mpi.h", "mpio.h", "mpicxx.h"] #When using MVAPICH
+ return hdrs
+
+def if_mpi(if_true, if_false = []):
+ return select({
+ "//tensorflow:with_mpi_support": if_true,
+ "//conditions:default": if_false
+ })
diff --git a/tensorflow/tools/ci_build/Dockerfile.cpu.mpi b/tensorflow/tools/ci_build/Dockerfile.cpu.mpi
new file mode 100644
index 0000000000..2bf7fd1d23
--- /dev/null
+++ b/tensorflow/tools/ci_build/Dockerfile.cpu.mpi
@@ -0,0 +1,24 @@
+FROM ubuntu:14.04
+
+LABEL authors="Andrew Gibiansky <andrew.gibiansky@gmail.com>, Joel Hestness <jthestness@gmail.com>"
+
+# Copy and run the install scripts.
+COPY install/*.sh /install/
+RUN /install/install_bootstrap_deb_packages.sh
+RUN add-apt-repository -y ppa:openjdk-r/ppa && \
+ add-apt-repository -y ppa:mc3man/trusty-media && \
+ add-apt-repository -y ppa:george-edison55/cmake-3.x
+RUN /install/install_deb_packages.sh
+RUN /install/install_pip_packages.sh
+RUN /install/install_bazel.sh
+RUN /install/install_proto3.sh
+RUN /install/install_buildifier.sh
+RUN /install/install_mpi.sh
+
+# Set up bazelrc.
+COPY install/.bazelrc /root/.bazelrc
+ENV BAZELRC /root/.bazelrc
+
+# Set up MPI
+ENV TF_NEED_MPI 1
+ENV MPI_HOME /usr/lib/openmpi
diff --git a/tensorflow/tools/ci_build/install/install_mpi.sh b/tensorflow/tools/ci_build/install/install_mpi.sh
new file mode 100755
index 0000000000..6ee9d76594
--- /dev/null
+++ b/tensorflow/tools/ci_build/install/install_mpi.sh
@@ -0,0 +1,23 @@
+#!/usr/bin/env bash
+# Copyright 2015 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+
+set +e
+mpiexec=$(which mpiexec)
+if [[ -z "$mpiexec" ]]; then
+ # Install dependencies from ubuntu deb repository.
+ apt-get update
+ apt-get install -y --no-install-recommends openmpi-bin libopenmpi-dev
+fi
diff --git a/tensorflow/tools/ci_build/pi/build_raspberry_pi.sh b/tensorflow/tools/ci_build/pi/build_raspberry_pi.sh
index f53bfb59ff..5244898c40 100755
--- a/tensorflow/tools/ci_build/pi/build_raspberry_pi.sh
+++ b/tensorflow/tools/ci_build/pi/build_raspberry_pi.sh
@@ -36,40 +36,46 @@ set -e
yes '' | ./configure
-# We need to update the Eigen version, because of compiler failures on ARM when
-# using the version currently (Aug 10th 2017) pulled by mainline TensorFlow. We
-# should be able to get rid of this hack once
-# https://github.com/tensorflow/tensorflow/issues/9697 is addressed.
-sed -i 's/f3a22f35b044/d781c1de9834/g' tensorflow/workspace.bzl
-sed -i 's/ca7beac153d4059c02c8fc59816c82d54ea47fe58365e8aded4082ded0b820c4/a34b208da6ec18fa8da963369e166e4a368612c14d956dd2f9d7072904675d9b/g' tensorflow/workspace.bzl
-
# Fix for curl build problem in 32-bit, see https://stackoverflow.com/questions/35181744/size-of-array-curl-rule-01-is-negative
sudo sed -i 's/define CURL_SIZEOF_LONG 8/define CURL_SIZEOF_LONG 4/g' /usr/include/curl/curlbuild.h
sudo sed -i 's/define CURL_SIZEOF_CURL_OFF_T 8/define CURL_SIZEOF_CURL_OFF_T 4/g' /usr/include/curl/curlbuild.h
+# The system-installed OpenSSL headers get pulled in by the latest BoringSSL
+# release on this configuration, so move them before we build:
+if [ -d /usr/include/openssl ]; then
+ sudo mv /usr/include/openssl /usr/include/openssl.original
+fi
+
+WORKSPACE_PATH=`pwd`
+
# Build the OpenBLAS library, which is faster than Eigen on the Pi Zero/One.
# TODO(petewarden) - It would be nicer to move this into the main Bazel build
# process if we can maintain a build file for this.
-mkdir toolchain
-cd toolchain
+TOOLCHAIN_INSTALL_PATH=/tmp/toolchain_install/
+sudo rm -rf ${TOOLCHAIN_INSTALL_PATH}
+mkdir ${TOOLCHAIN_INSTALL_PATH}
+cd ${TOOLCHAIN_INSTALL_PATH}
curl -L https://github.com/raspberrypi/tools/archive/0e906ebc527eab1cdbf7adabff5b474da9562e9f.tar.gz -o toolchain.tar.gz
tar xzf toolchain.tar.gz
mv tools-0e906ebc527eab1cdbf7adabff5b474da9562e9f/ tools
-cd ..
-CROSSTOOL_CC=$(pwd)/toolchain/tools/arm-bcm2708/arm-rpi-4.9.3-linux-gnueabihf/bin/arm-linux-gnueabihf-gcc
+CROSSTOOL_CC=${TOOLCHAIN_INSTALL_PATH}/tools/arm-bcm2708/arm-rpi-4.9.3-linux-gnueabihf/bin/arm-linux-gnueabihf-gcc
-git clone https://github.com/xianyi/OpenBLAS openblas
-cd openblas
+OPENBLAS_SRC_PATH=/tmp/openblas_src/
+sudo rm -rf ${OPENBLAS_SRC_PATH}
+git clone https://github.com/xianyi/OpenBLAS ${OPENBLAS_SRC_PATH}
+cd ${OPENBLAS_SRC_PATH}
+# If this path is changed, you'll also need to update
+# cxx_builtin_include_directory in third_party/toolchains/cpus/arm/CROSSTOOL.tpl
+OPENBLAS_INSTALL_PATH=/tmp/openblas_install/
make CC=${CROSSTOOL_CC} FC=${CROSSTOOL_CC} HOSTCC=gcc TARGET=ARMV6
-make PREFIX=$(pwd)/toolchain/openblas/ install
-cd ..
+make PREFIX=${OPENBLAS_INSTALL_PATH} install
if [[ $1 == "PI_ONE" ]]; then
PI_COPTS="--copt=-march=armv6 --copt=-mfpu=vfp
--copt=-DUSE_GEMM_FOR_CONV --copt=-DUSE_OPENBLAS
- --copt=-isystem=$(pwd)/toolchain/openblas/include/
- --linkopt=-L$(pwd)/toolchain/openblas/lib/
+ --copt=-isystem --copt=${OPENBLAS_INSTALL_PATH}/include/
+ --linkopt=-L${OPENBLAS_INSTALL_PATH}/lib/
--linkopt=-l:libopenblas.a"
echo "Building for the Pi One/Zero, with no NEON support"
else
@@ -80,7 +86,9 @@ else
echo "Building for the Pi Two/Three, with NEON acceleration"
fi
+cd ${WORKSPACE_PATH}
bazel build -c opt ${PI_COPTS} \
+ --config=monolithic \
--copt=-funsafe-math-optimizations --copt=-ftree-vectorize \
--copt=-fomit-frame-pointer --cpu=armeabi \
--crosstool_top=@local_config_arm_compiler//:toolchain \
@@ -88,15 +96,15 @@ bazel build -c opt ${PI_COPTS} \
//tensorflow/tools/benchmark:benchmark_model \
//tensorflow/tools/pip_package:build_pip_package
-OUTDIR=bazel-out/pi
-mkdir -p ${OUTDIR}
+OUTDIR=output-artifacts
+mkdir -p "${OUTDIR}"
echo "Final outputs will go to ${OUTDIR}"
# Build a universal wheel.
BDIST_OPTS="--universal" \
bazel-bin/tensorflow/tools/pip_package/build_pip_package "${OUTDIR}"
-OLD_FN=$(ls "${OUTDIR}" | grep \.whl)
+OLD_FN=$(ls "${OUTDIR}" | grep -m 1 \.whl)
SUB='s/tensorflow-([^-]+)-([^-]+)-.*/tensorflow-\1-\2-none-any.whl/; print'
NEW_FN=$(echo "${OLD_FN}" | perl -ne "${SUB}")
mv "${OUTDIR}/${OLD_FN}" "${OUTDIR}/${NEW_FN}"
diff --git a/tensorflow/tools/ci_build/update_version.py b/tensorflow/tools/ci_build/update_version.py
index 4405678a6b..c7841f35aa 100755
--- a/tensorflow/tools/ci_build/update_version.py
+++ b/tensorflow/tools/ci_build/update_version.py
@@ -333,8 +333,10 @@ def main():
old_version = get_current_semver_version()
if args.nightly:
+ # The dev minor version is one ahead of the official release.
+ nightly_minor_ver = int(old_version.minor) + 1
new_version = Version(old_version.major,
- old_version.minor,
+ str(nightly_minor_ver),
old_version.patch,
"-dev" + time.strftime("%Y%m%d"),
NIGHTLY_VERSION)
diff --git a/tensorflow/tools/ci_build/windows/bazel/bazel_test_lib.sh b/tensorflow/tools/ci_build/windows/bazel/bazel_test_lib.sh
index 7f7bc06e54..6a8b6417d6 100644
--- a/tensorflow/tools/ci_build/windows/bazel/bazel_test_lib.sh
+++ b/tensorflow/tools/ci_build/windows/bazel/bazel_test_lib.sh
@@ -97,15 +97,7 @@ exclude_cpu_cc_tests="${failing_cpu_cc_tests} + ${broken_cpu_cc_tests}"
exclude_gpu_cc_tests="${extra_failing_gpu_cc_tests} + ${exclude_cpu_cc_tests}"
function clean_output_base() {
- # TODO(pcloudy): bazel clean --expunge doesn't work on Windows yet.
- # Clean the output base manually to ensure build correctness
- bazel clean
- output_base=$(bazel info output_base)
- bazel shutdown
- # Sleep 5s to wait for jvm shutdown completely
- # otherwise rm will fail with device or resource busy error
- sleep 5
- rm -rf ${output_base}
+ bazel clean --expunge
}
function run_configure_for_cpu_build {
diff --git a/tensorflow/tools/ci_build/windows/bazel/common_env.sh b/tensorflow/tools/ci_build/windows/bazel/common_env.sh
index 3aa034ef6e..4a653698a2 100644
--- a/tensorflow/tools/ci_build/windows/bazel/common_env.sh
+++ b/tensorflow/tools/ci_build/windows/bazel/common_env.sh
@@ -55,8 +55,11 @@ export PATH="/c/Program Files/NVIDIA GPU Computing Toolkit/CUDA/v8.0/extras/CUPT
export PATH="/c/tools/cuda/bin:$PATH"
# Set the common build options on Windows
-export BUILD_OPTS='--copt=-w --host_copt=-w --verbose_failures --experimental_ui'
+export BUILD_OPTS='--config=monolithic --copt=-w --host_copt=-w --verbose_failures --experimental_ui'
# Build TF with wrapper-less CROSSTOOL
# TODO(pcloudy): Remove this after wrapper-less CROSSTOOL becomes default
export NO_MSVC_WRAPPER=1
+
+export USE_DYNAMIC_CRT=1
+
diff --git a/tensorflow/tools/docker/Dockerfile.devel b/tensorflow/tools/docker/Dockerfile.devel
index 7fce91a469..60a94504b7 100644
--- a/tensorflow/tools/docker/Dockerfile.devel
+++ b/tensorflow/tools/docker/Dockerfile.devel
@@ -72,7 +72,7 @@ RUN mkdir /bazel && \
RUN git clone https://github.com/tensorflow/tensorflow.git && \
cd tensorflow && \
- git checkout r1.4
+ git checkout r1.3
WORKDIR /tensorflow
# TODO(craigcitro): Don't install the pip package, since it makes it
diff --git a/tensorflow/tools/docker/Dockerfile.devel-gpu b/tensorflow/tools/docker/Dockerfile.devel-gpu
index 75dc4c2c4f..f5364d803a 100644
--- a/tensorflow/tools/docker/Dockerfile.devel-gpu
+++ b/tensorflow/tools/docker/Dockerfile.devel-gpu
@@ -73,7 +73,7 @@ RUN mkdir /bazel && \
RUN git clone https://github.com/tensorflow/tensorflow.git && \
cd tensorflow && \
- git checkout r1.4
+ git checkout r1.3
WORKDIR /tensorflow
# Configure the build for our CUDA configuration.
diff --git a/tensorflow/tools/docker/notebooks/3_mnist_from_scratch.ipynb b/tensorflow/tools/docker/notebooks/3_mnist_from_scratch.ipynb
index fddb624853..614a19c178 100644
--- a/tensorflow/tools/docker/notebooks/3_mnist_from_scratch.ipynb
+++ b/tensorflow/tools/docker/notebooks/3_mnist_from_scratch.ipynb
@@ -1256,7 +1256,7 @@
" \n",
"But, here, we'll want to keep the session open so we can poke at values as we work out the details of training. The TensorFlow API includes a function for this, `InteractiveSession`.\n",
"\n",
- "We'll start by creating a session and initializing the varibles we defined above."
+ "We'll start by creating a session and initializing the variables we defined above."
]
},
{
diff --git a/tensorflow/tools/pip_package/setup.py b/tensorflow/tools/pip_package/setup.py
index 00dffc4d27..a7a0706d0b 100644
--- a/tensorflow/tools/pip_package/setup.py
+++ b/tensorflow/tools/pip_package/setup.py
@@ -29,7 +29,7 @@ from setuptools.dist import Distribution
# This version string is semver compatible, but incompatible with pip.
# For pip, we will remove all '-' characters from this string, and use the
# result for pip.
-_VERSION = '1.4.0-dev'
+_VERSION = '1.3.0'
REQUIRED_PACKAGES = [
'enum34 >= 1.1.6',
diff --git a/tensorflow/workspace.bzl b/tensorflow/workspace.bzl
index 67d3929e8a..84e5c3ab61 100644
--- a/tensorflow/workspace.bzl
+++ b/tensorflow/workspace.bzl
@@ -158,7 +158,7 @@ def tf_workspace(path_prefix="", tf_repo_name=""):
name = "mkl",
urls = [
"http://mirror.bazel.build/github.com/01org/mkl-dnn/releases/download/v0.9/mklml_lnx_2018.0.20170720.tgz",
- "https://github.com/01org/mkl-dnn/releases/download/v0.9/mklml_lnx_2018.0.20170720.tgz",
+ # "https://github.com/01org/mkl-dnn/releases/download/v0.9/mklml_lnx_2018.0.20170720.tgz",
],
sha256 = "57ba56c4c243f403ff78f417ff854ef50b9eddf4a610a917b7c95e7fa8553a4b",
strip_prefix = "mklml_lnx_2018.0.20170720",
@@ -173,11 +173,11 @@ def tf_workspace(path_prefix="", tf_repo_name=""):
native.new_http_archive(
name = "eigen_archive",
urls = [
- "http://mirror.bazel.build/bitbucket.org/eigen/eigen/get/f3a22f35b044.tar.gz",
- "https://bitbucket.org/eigen/eigen/get/f3a22f35b044.tar.gz",
+ "https://bitbucket.org/eigen/eigen/get/429aa5254200.tar.gz",
+ "http://mirror.bazel.build/bitbucket.org/eigen/eigen/get/429aa5254200.tar.gz",
],
- sha256 = "ca7beac153d4059c02c8fc59816c82d54ea47fe58365e8aded4082ded0b820c4",
- strip_prefix = "eigen-eigen-f3a22f35b044",
+ sha256 = "61d8b6fc4279dd1dda986fb1677d15e3d641c07a3ea5abe255790b1f0c0c14e9",
+ strip_prefix = "eigen-eigen-429aa5254200",
build_file = str(Label("//third_party:eigen.BUILD")),
)
@@ -188,7 +188,7 @@ def tf_workspace(path_prefix="", tf_repo_name=""):
strip_prefix = "tools-0e906ebc527eab1cdbf7adabff5b474da9562e9f/arm-bcm2708/arm-rpi-4.9.3-linux-gnueabihf",
urls = [
"http://mirror.bazel.build/github.com/raspberrypi/tools/archive/0e906ebc527eab1cdbf7adabff5b474da9562e9f.tar.gz",
- "https://github.com/raspberrypi/tools/archive/0e906ebc527eab1cdbf7adabff5b474da9562e9f.tar.gz",
+ # "https://github.com/raspberrypi/tools/archive/0e906ebc527eab1cdbf7adabff5b474da9562e9f.tar.gz",
],
)
@@ -196,7 +196,7 @@ def tf_workspace(path_prefix="", tf_repo_name=""):
name = "libxsmm_archive",
urls = [
"http://mirror.bazel.build/github.com/hfp/libxsmm/archive/1.8.1.tar.gz",
- "https://github.com/hfp/libxsmm/archive/1.8.1.tar.gz",
+ # "https://github.com/hfp/libxsmm/archive/1.8.1.tar.gz",
],
sha256 = "2ade869c3f42f23b5263c7d594aa3c7e5e61ac6a3afcaf5d6e42899d2a7986ce",
strip_prefix = "libxsmm-1.8.1",
@@ -212,7 +212,7 @@ def tf_workspace(path_prefix="", tf_repo_name=""):
name = "ortools_archive",
urls = [
"http://mirror.bazel.build/github.com/google/or-tools/archive/253f7955c6a1fd805408fba2e42ac6d45b312d15.tar.gz",
- "https://github.com/google/or-tools/archive/253f7955c6a1fd805408fba2e42ac6d45b312d15.tar.gz",
+ # "https://github.com/google/or-tools/archive/253f7955c6a1fd805408fba2e42ac6d45b312d15.tar.gz",
],
sha256 = "932075525642b04ac6f1b50589f1df5cd72ec2f448b721fd32234cf183f0e755",
strip_prefix = "or-tools-253f7955c6a1fd805408fba2e42ac6d45b312d15/src",
@@ -223,7 +223,7 @@ def tf_workspace(path_prefix="", tf_repo_name=""):
name = "com_googlesource_code_re2",
urls = [
"http://mirror.bazel.build/github.com/google/re2/archive/b94b7cd42e9f02673cd748c1ac1d16db4052514c.tar.gz",
- "https://github.com/google/re2/archive/b94b7cd42e9f02673cd748c1ac1d16db4052514c.tar.gz",
+ # "https://github.com/google/re2/archive/b94b7cd42e9f02673cd748c1ac1d16db4052514c.tar.gz",
],
sha256 = "bd63550101e056427c9e7ff12a408c1c8b74e9803f393ca916b2926fc2c4906f",
strip_prefix = "re2-b94b7cd42e9f02673cd748c1ac1d16db4052514c",
@@ -232,22 +232,21 @@ def tf_workspace(path_prefix="", tf_repo_name=""):
native.http_archive(
name = "gemmlowp",
urls = [
- # TODO(gunan): reenable once mirror is updated.
- # "http://mirror.bazel.build/github.com/google/gemmlowp/archive/010bb3e71a26ca1d0884a167081d092b43563996.tar.gz",
- "https://github.com/google/gemmlowp/archive/010bb3e71a26ca1d0884a167081d092b43563996.tar.gz",
+ "http://mirror.bazel.build/github.com/google/gemmlowp/archive/010bb3e71a26ca1d0884a167081d092b43563996.zip"
+ # "https://github.com/google/gemmlowp/archive/010bb3e71a26ca1d0884a167081d092b43563996.zip",
],
- sha256 = "861cc6d9d902861f54fd77e1ab79286477dcc559b2a283e75b9c22d37b61f6ae",
+ sha256 = "dd2557072bde12141419cb8320a9c25e6ec41a8ae53c2ac78c076a347bb46d9d",
strip_prefix = "gemmlowp-010bb3e71a26ca1d0884a167081d092b43563996",
)
native.new_http_archive(
name = "farmhash_archive",
urls = [
- "http://mirror.bazel.build/github.com/google/farmhash/archive/23eecfbe7e84ebf2e229bd02248f431c36e12f1a.tar.gz",
- "https://github.com/google/farmhash/archive/23eecfbe7e84ebf2e229bd02248f431c36e12f1a.tar.gz",
+ "http://mirror.bazel.build/github.com/google/farmhash/archive/816a4ae622e964763ca0862d9dbd19324a1eaf45.tar.gz",
+ # "https://github.com/google/farmhash/archive/816a4ae622e964763ca0862d9dbd19324a1eaf45.tar.gz",
],
- sha256 = "e5c86a2e32e7cb1d027d713cbf338be68ebbea76dbb2b2fdaae918864d3f8f3d",
- strip_prefix = "farmhash-23eecfbe7e84ebf2e229bd02248f431c36e12f1a",
+ sha256 = "6560547c63e4af82b0f202cb710ceabb3f21347a4b996db565a411da5b17aba0",
+ strip_prefix = "farmhash-816a4ae622e964763ca0862d9dbd19324a1eaf45",
build_file = str(Label("//third_party:farmhash.BUILD")),
)
@@ -260,7 +259,7 @@ def tf_workspace(path_prefix="", tf_repo_name=""):
name = "highwayhash",
urls = [
"http://mirror.bazel.build/github.com/google/highwayhash/archive/dfcb97ca4fe9277bf9dc1802dd979b071896453b.tar.gz",
- "https://github.com/google/highwayhash/archive/dfcb97ca4fe9277bf9dc1802dd979b071896453b.tar.gz",
+ # "https://github.com/google/highwayhash/archive/dfcb97ca4fe9277bf9dc1802dd979b071896453b.tar.gz",
],
sha256 = "0f30a15b1566d93f146c8d149878a06e91d9bb7ec2cfd76906df62a82be4aac9",
strip_prefix = "highwayhash-dfcb97ca4fe9277bf9dc1802dd979b071896453b",
@@ -282,7 +281,7 @@ def tf_workspace(path_prefix="", tf_repo_name=""):
name = "jpeg",
urls = [
"http://mirror.bazel.build/github.com/libjpeg-turbo/libjpeg-turbo/archive/1.5.1.tar.gz",
- "https://github.com/libjpeg-turbo/libjpeg-turbo/archive/1.5.1.tar.gz",
+ # "https://github.com/libjpeg-turbo/libjpeg-turbo/archive/1.5.1.tar.gz",
],
sha256 = "c15a9607892113946379ccea3ca8b85018301b200754f209453ab21674268e77",
strip_prefix = "libjpeg-turbo-1.5.1",
@@ -294,7 +293,7 @@ def tf_workspace(path_prefix="", tf_repo_name=""):
name = "png_archive",
urls = [
"http://mirror.bazel.build/github.com/glennrp/libpng/archive/v1.2.53.tar.gz",
- "https://github.com/glennrp/libpng/archive/v1.2.53.tar.gz",
+ # "https://github.com/glennrp/libpng/archive/v1.2.53.tar.gz",
],
sha256 = "716c59c7dfc808a4c368f8ada526932be72b2fcea11dd85dc9d88b1df1dfe9c2",
strip_prefix = "libpng-1.2.53",
@@ -346,7 +345,7 @@ def tf_workspace(path_prefix="", tf_repo_name=""):
name = "com_github_andreif_codegen",
urls = [
"http://mirror.bazel.build/github.com/andreif/codegen/archive/1.0.tar.gz",
- "https://github.com/andreif/codegen/archive/1.0.tar.gz",
+ # "https://github.com/andreif/codegen/archive/1.0.tar.gz",
],
sha256 = "2dadd04a2802de27e0fe5a19b76538f6da9d39ff244036afa00c1bba754de5ee",
strip_prefix = "codegen-1.0",
@@ -416,7 +415,8 @@ def tf_workspace(path_prefix="", tf_repo_name=""):
native.http_archive(
name = "nsync",
urls = [
- "https://github.com/google/nsync/archive/ad722c76c6e6653f66be2e1f69521b7f7517da55.tar.gz",
+ "http://mirror.bazel.build/github.com/google/nsync/archive/ad722c76c6e6653f66be2e1f69521b7f7517da55.tar.gz",
+ # "https://github.com/google/nsync/archive/ad722c76c6e6653f66be2e1f69521b7f7517da55.tar.gz",
],
sha256 = "7dd8ca49319f77e8226cd020a9210a525f88ac26e7041c59c95418223a1cdf55",
strip_prefix = "nsync-ad722c76c6e6653f66be2e1f69521b7f7517da55",
@@ -426,7 +426,7 @@ def tf_workspace(path_prefix="", tf_repo_name=""):
name = "com_google_googletest",
urls = [
"http://mirror.bazel.build/github.com/google/googletest/archive/9816b96a6ddc0430671693df90192bbee57108b6.zip",
- "https://github.com/google/googletest/archive/9816b96a6ddc0430671693df90192bbee57108b6.zip",
+ # "https://github.com/google/googletest/archive/9816b96a6ddc0430671693df90192bbee57108b6.zip",
],
sha256 = "9cbca84c4256bed17df2c8f4d00c912c19d247c11c9ba6647cd6dd5b5c996b8d",
strip_prefix = "googletest-9816b96a6ddc0430671693df90192bbee57108b6",
@@ -436,7 +436,7 @@ def tf_workspace(path_prefix="", tf_repo_name=""):
name = "com_github_gflags_gflags",
urls = [
"http://mirror.bazel.build/github.com/gflags/gflags/archive/f8a0efe03aa69b3336d8e228b37d4ccb17324b88.tar.gz",
- "https://github.com/gflags/gflags/archive/f8a0efe03aa69b3336d8e228b37d4ccb17324b88.tar.gz",
+ # "https://github.com/gflags/gflags/archive/f8a0efe03aa69b3336d8e228b37d4ccb17324b88.tar.gz",
],
sha256 = "4d222fab8f1ede4709cdff417d15a1336f862d7334a81abf76d09c15ecf9acd1",
strip_prefix = "gflags-f8a0efe03aa69b3336d8e228b37d4ccb17324b88",
@@ -505,7 +505,7 @@ def tf_workspace(path_prefix="", tf_repo_name=""):
name = "grpc",
urls = [
"http://mirror.bazel.build/github.com/grpc/grpc/archive/781fd6f6ea03645a520cd5c675da67ab61f87e4b.tar.gz",
- "https://github.com/grpc/grpc/archive/781fd6f6ea03645a520cd5c675da67ab61f87e4b.tar.gz",
+ # "https://github.com/grpc/grpc/archive/781fd6f6ea03645a520cd5c675da67ab61f87e4b.tar.gz",
],
sha256 = "2004635e6a078acfac8ffa71738397796be4f8fb72f572cc44ecee5d99511d9f",
strip_prefix = "grpc-781fd6f6ea03645a520cd5c675da67ab61f87e4b",
@@ -529,7 +529,7 @@ def tf_workspace(path_prefix="", tf_repo_name=""):
sha256 = "7f51f45887a3d31b4ce4fa5965210a5e64637ceac12720cfce7954d6a2e812f7",
urls = [
"http://mirror.bazel.build/github.com/antirez/linenoise/archive/c894b9e59f02203dbe4e2be657572cf88c4230c3.tar.gz",
- "https://github.com/antirez/linenoise/archive/c894b9e59f02203dbe4e2be657572cf88c4230c3.tar.gz",
+ # "https://github.com/antirez/linenoise/archive/c894b9e59f02203dbe4e2be657572cf88c4230c3.tar.gz",
],
strip_prefix = "linenoise-c894b9e59f02203dbe4e2be657572cf88c4230c3",
build_file = str(Label("//third_party:linenoise.BUILD")),
@@ -553,7 +553,7 @@ def tf_workspace(path_prefix="", tf_repo_name=""):
name = "lmdb",
urls = [
"http://mirror.bazel.build/github.com/LMDB/lmdb/archive/LMDB_0.9.19.tar.gz",
- "https://github.com/LMDB/lmdb/archive/LMDB_0.9.19.tar.gz",
+ # "https://github.com/LMDB/lmdb/archive/LMDB_0.9.19.tar.gz",
],
sha256 = "108532fb94c6f227558d45be3f3347b52539f0f58290a7bb31ec06c462d05326",
strip_prefix = "lmdb-LMDB_0.9.19/libraries/liblmdb",
@@ -564,7 +564,7 @@ def tf_workspace(path_prefix="", tf_repo_name=""):
name = "jsoncpp_git",
urls = [
"http://mirror.bazel.build/github.com/open-source-parsers/jsoncpp/archive/11086dd6a7eba04289944367ca82cea71299ed70.tar.gz",
- "https://github.com/open-source-parsers/jsoncpp/archive/11086dd6a7eba04289944367ca82cea71299ed70.tar.gz",
+ # "https://github.com/open-source-parsers/jsoncpp/archive/11086dd6a7eba04289944367ca82cea71299ed70.tar.gz",
],
sha256 = "07d34db40593d257324ec5fb9debc4dc33f29f8fb44e33a2eeb35503e61d0fe2",
strip_prefix = "jsoncpp-11086dd6a7eba04289944367ca82cea71299ed70",
@@ -580,7 +580,7 @@ def tf_workspace(path_prefix="", tf_repo_name=""):
name = "boringssl",
urls = [
"http://mirror.bazel.build/github.com/google/boringssl/archive/e3860009a091cd1bd2bc189cdbc3c6d095abde84.tar.gz",
- "https://github.com/google/boringssl/archive/e3860009a091cd1bd2bc189cdbc3c6d095abde84.tar.gz", # 2017-07-07
+ # "https://github.com/google/boringssl/archive/e3860009a091cd1bd2bc189cdbc3c6d095abde84.tar.gz", # 2017-07-07
],
sha256 = "02f5950f93c4fd3691771c07c9d04cf2999ab01383ff99da345249e93b0fcfb2",
strip_prefix = "boringssl-e3860009a091cd1bd2bc189cdbc3c6d095abde84",
@@ -618,7 +618,7 @@ def tf_workspace(path_prefix="", tf_repo_name=""):
name = "snappy",
urls = [
"http://mirror.bazel.build/github.com/google/snappy/archive/1.1.4.tar.gz",
- "https://github.com/google/snappy/archive/1.1.4.tar.gz",
+ # "https://github.com/google/snappy/archive/1.1.4.tar.gz",
],
sha256 = "2f7504c73d85bac842e893340333be8cb8561710642fc9562fccdd9d2c3fcc94",
strip_prefix = "snappy-1.1.4",
@@ -630,7 +630,7 @@ def tf_workspace(path_prefix="", tf_repo_name=""):
name = "nccl_archive",
urls = [
"http://mirror.bazel.build/github.com/nvidia/nccl/archive/03d856977ecbaac87e598c0c4bafca96761b9ac7.tar.gz",
- "https://github.com/nvidia/nccl/archive/03d856977ecbaac87e598c0c4bafca96761b9ac7.tar.gz",
+ # "https://github.com/nvidia/nccl/archive/03d856977ecbaac87e598c0c4bafca96761b9ac7.tar.gz",
],
sha256 = "2ca86fb6179ecbff789cc67c836139c1bbc0324ed8c04643405a30bf26325176",
strip_prefix = "nccl-03d856977ecbaac87e598c0c4bafca96761b9ac7",
@@ -638,6 +638,17 @@ def tf_workspace(path_prefix="", tf_repo_name=""):
repository = tf_repo_name,
)
+ temp_workaround_http_archive(
+ name = "aws",
+ urls = [
+ "http://bazel-mirror.storage.googleapis.com/github.com/aws/aws-sdk-cpp/archive/1.0.90.tar.gz",
+ # "https://github.com/aws/aws-sdk-cpp/archive/1.0.90.tar.gz",
+ ],
+ sha256 = "f599b57aec4f03ad696044dd430b2d201864113937353adc346f53ad47991319",
+ strip_prefix = "aws-sdk-cpp-1.0.90",
+ build_file = str(Label("//third_party:aws.BUILD")),
+ )
+
java_import_external(
name = "junit",
jar_sha256 = "59721f0805e223d84b90677887d9ff567dc534d7c502ca903c0c2b17f05c116a",
@@ -667,7 +678,7 @@ def tf_workspace(path_prefix="", tf_repo_name=""):
name = "jemalloc",
urls = [
"http://mirror.bazel.build/github.com/jemalloc/jemalloc/archive/4.4.0.tar.gz",
- "https://github.com/jemalloc/jemalloc/archive/4.4.0.tar.gz",
+ # "https://github.com/jemalloc/jemalloc/archive/4.4.0.tar.gz",
],
sha256 = "3c8f25c02e806c3ce0ab5fb7da1817f89fc9732709024e2a81b6b82f7cc792a8",
strip_prefix = "jemalloc-4.4.0",
@@ -679,7 +690,7 @@ def tf_workspace(path_prefix="", tf_repo_name=""):
name = "com_google_pprof",
urls = [
"http://mirror.bazel.build/github.com/google/pprof/archive/c0fb62ec88c411cc91194465e54db2632845b650.tar.gz",
- "https://github.com/google/pprof/archive/c0fb62ec88c411cc91194465e54db2632845b650.tar.gz",
+ # "https://github.com/google/pprof/archive/c0fb62ec88c411cc91194465e54db2632845b650.tar.gz",
],
sha256 = "e0928ca4aa10ea1e0551e2d7ce4d1d7ea2d84b2abbdef082b0da84268791d0c4",
strip_prefix = "pprof-c0fb62ec88c411cc91194465e54db2632845b650",
@@ -690,7 +701,7 @@ def tf_workspace(path_prefix="", tf_repo_name=""):
name = "cub_archive",
urls = [
"http://mirror.bazel.build/github.com/NVlabs/cub/archive/1.7.3.zip",
- "https://github.com/NVlabs/cub/archive/1.7.3.zip",
+ # "https://github.com/NVlabs/cub/archive/1.7.3.zip",
],
sha256 = "b7ead9e291d34ffa8074243541c1380d63be63f88de23de8ee548db573b72ebe",
strip_prefix = "cub-1.7.3",
@@ -706,7 +717,7 @@ def tf_workspace(path_prefix="", tf_repo_name=""):
name = "bazel_toolchains",
urls = [
"http://mirror.bazel.build/github.com/bazelbuild/bazel-toolchains/archive/9dbd803ad3b9447430a296810197b09b3a710956.tar.gz",
- "https://github.com/bazelbuild/bazel-toolchains/archive/9dbd803ad3b9447430a296810197b09b3a710956.tar.gz",
+ # "https://github.com/bazelbuild/bazel-toolchains/archive/9dbd803ad3b9447430a296810197b09b3a710956.tar.gz",
],
sha256 = "0799aa12db5260a499beb40f81744e760c59d055bfc5d271dd2c2ed4d5419faa",
strip_prefix = "bazel-toolchains-9dbd803ad3b9447430a296810197b09b3a710956",
diff --git a/third_party/toolchains/cpus/arm/CROSSTOOL.tpl b/third_party/toolchains/cpus/arm/CROSSTOOL.tpl
index 04e399bed1..ad7f5596d0 100644
--- a/third_party/toolchains/cpus/arm/CROSSTOOL.tpl
+++ b/third_party/toolchains/cpus/arm/CROSSTOOL.tpl
@@ -77,7 +77,9 @@ toolchain {
cxx_builtin_include_directory: "%{ARM_COMPILER_PATH}%/lib/gcc/arm-linux-gnueabihf/4.9.3/include-fixed"
cxx_builtin_include_directory: "%{ARM_COMPILER_PATH}%/local_include"
cxx_builtin_include_directory: "/usr/include"
- cxx_builtin_include_directory: "/workspace/toolchain/openblas/include/"
+ # The path below must match the one used in
+ # tensorflow/tools/ci_build/pi/build_raspberry_pi.sh.
+ cxx_builtin_include_directory: "/tmp/openblas_install/include/"
cxx_flag: "-std=c++11"
# The cxx_builtin_include_directory directives don't seem to be adding these, so
# explicitly set them as flags. There's a query to the Bazel team outstanding about