author    Martin Wicke <martin.wicke@gmail.com>  2017-11-10 12:26:11 -0800
committer GitHub <noreply@github.com>  2017-11-10 12:26:11 -0800
commit    d0a5d885d61b837018cb931a4d577289acc826fc (patch)
tree      dd344e45c4eca857c02746ef50d990a9cd81ea69
parent    047d7965d2877d7b55f4cdb0d0abdcd733f266a9 (diff)
Revert "Branch 175277161"
-rw-r--r--  CODE_OF_CONDUCT.md | 2
-rw-r--r--  README.md | 4
-rw-r--r--  configure.py | 47
-rw-r--r--  tensorflow/BUILD | 124
-rw-r--r--  tensorflow/c/c_api.cc | 4
-rw-r--r--  tensorflow/c/c_api_test.cc | 4
-rw-r--r--  tensorflow/c/eager/BUILD | 2
-rw-r--r--  tensorflow/c/eager/tape.cc | 102
-rw-r--r--  tensorflow/c/eager/tape.h | 501
-rw-r--r--  tensorflow/compiler/aot/tfcompile.bzl | 15
-rw-r--r--  tensorflow/compiler/jit/kernels/xla_launch_op.cc | 1
-rw-r--r--  tensorflow/compiler/jit/xla_compilation_cache.cc | 3
-rw-r--r--  tensorflow/compiler/tests/BUILD | 2
-rw-r--r--  tensorflow/compiler/tests/fused_batchnorm_test.py | 25
-rw-r--r--  tensorflow/compiler/tf2xla/type_util.cc | 3
-rw-r--r--  tensorflow/compiler/tf2xla/xla_compiler.h | 6
-rw-r--r--  tensorflow/compiler/xla/BUILD | 2
-rw-r--r--  tensorflow/compiler/xla/array.h | 159
-rw-r--r--  tensorflow/compiler/xla/array_test.cc | 45
-rw-r--r--  tensorflow/compiler/xla/client/client.cc | 3
-rw-r--r--  tensorflow/compiler/xla/client/computation_builder.h | 1
-rw-r--r--  tensorflow/compiler/xla/client/lib/BUILD | 1
-rw-r--r--  tensorflow/compiler/xla/client/lib/testing.cc | 57
-rw-r--r--  tensorflow/compiler/xla/client/lib/testing.h | 4
-rw-r--r--  tensorflow/compiler/xla/client/local_client.cc | 20
-rw-r--r--  tensorflow/compiler/xla/client/local_client.h | 16
-rw-r--r--  tensorflow/compiler/xla/literal_util.cc | 121
-rw-r--r--  tensorflow/compiler/xla/literal_util.h | 25
-rw-r--r--  tensorflow/compiler/xla/literal_util_test.cc | 62
-rw-r--r--  tensorflow/compiler/xla/primitive_util.cc | 8
-rw-r--r--  tensorflow/compiler/xla/primitive_util.h | 7
-rw-r--r--  tensorflow/compiler/xla/service/BUILD | 4
-rw-r--r--  tensorflow/compiler/xla/service/backend.cc | 4
-rw-r--r--  tensorflow/compiler/xla/service/buffer_assignment.cc | 21
-rw-r--r--  tensorflow/compiler/xla/service/cpu/BUILD | 4
-rw-r--r--  tensorflow/compiler/xla/service/cpu/cpu_options.cc | 16
-rw-r--r--  tensorflow/compiler/xla/service/cpu/cpu_options.h | 2
-rw-r--r--  tensorflow/compiler/xla/service/cpu/cpu_runtime_test.cc | 4
-rw-r--r--  tensorflow/compiler/xla/service/cpu/dot_op_emitter.cc | 569
-rw-r--r--  tensorflow/compiler/xla/service/cpu/dot_op_emitter.h | 37
-rw-r--r--  tensorflow/compiler/xla/service/cpu/ir_emission_utils.cc | 17
-rw-r--r--  tensorflow/compiler/xla/service/cpu/ir_emission_utils.h | 11
-rw-r--r--  tensorflow/compiler/xla/service/cpu/ir_emitter.cc | 10
-rw-r--r--  tensorflow/compiler/xla/service/cpu/ir_emitter.h | 2
-rw-r--r--  tensorflow/compiler/xla/service/cpu/layout_assignment.cc | 4
-rw-r--r--  tensorflow/compiler/xla/service/dfs_hlo_visitor.h | 6
-rw-r--r--  tensorflow/compiler/xla/service/dfs_hlo_visitor_with_default.h | 10
-rw-r--r--  tensorflow/compiler/xla/service/gpu/convolution_thunk.cc | 22
-rw-r--r--  tensorflow/compiler/xla/service/gpu/gpu_compiler.cc | 18
-rw-r--r--  tensorflow/compiler/xla/service/gpu/ir_emitter.cc | 8
-rw-r--r--  tensorflow/compiler/xla/service/gpu/ir_emitter.h | 2
-rw-r--r--  tensorflow/compiler/xla/service/gpu/llvm_gpu_backend/gpu_backend_lib.cc | 4
-rw-r--r--  tensorflow/compiler/xla/service/hlo_cost_analysis.cc | 8
-rw-r--r--  tensorflow/compiler/xla/service/hlo_cost_analysis.h | 2
-rw-r--r--  tensorflow/compiler/xla/service/hlo_cse_test.cc | 24
-rw-r--r--  tensorflow/compiler/xla/service/hlo_dataflow_analysis.cc | 65
-rw-r--r--  tensorflow/compiler/xla/service/hlo_dataflow_analysis.h | 2
-rw-r--r--  tensorflow/compiler/xla/service/hlo_dataflow_analysis_test.cc | 48
-rw-r--r--  tensorflow/compiler/xla/service/hlo_evaluator.cc | 4
-rw-r--r--  tensorflow/compiler/xla/service/hlo_evaluator.h | 15
-rw-r--r--  tensorflow/compiler/xla/service/hlo_graph_dumper.cc | 26
-rw-r--r--  tensorflow/compiler/xla/service/hlo_instruction.cc | 97
-rw-r--r--  tensorflow/compiler/xla/service/hlo_instruction.h | 31
-rw-r--r--  tensorflow/compiler/xla/service/hlo_instruction_test.cc | 4
-rw-r--r--  tensorflow/compiler/xla/service/hlo_matchers.h | 2
-rw-r--r--  tensorflow/compiler/xla/service/hlo_module.h | 6
-rw-r--r--  tensorflow/compiler/xla/service/hlo_module_config.cc | 4
-rw-r--r--  tensorflow/compiler/xla/service/hlo_module_config.h | 10
-rw-r--r--  tensorflow/compiler/xla/service/hlo_opcode.h | 2
-rw-r--r--  tensorflow/compiler/xla/service/hlo_rematerialization.cc | 2
-rw-r--r--  tensorflow/compiler/xla/service/hlo_runner.cc | 3
-rw-r--r--  tensorflow/compiler/xla/service/hlo_sharding.cc | 74
-rw-r--r--  tensorflow/compiler/xla/service/hlo_sharding.h | 83
-rw-r--r--  tensorflow/compiler/xla/service/hlo_sharding_test.cc | 68
-rw-r--r--  tensorflow/compiler/xla/service/hlo_verifier.cc | 49
-rw-r--r--  tensorflow/compiler/xla/service/instruction_fusion.cc | 2
-rw-r--r--  tensorflow/compiler/xla/service/interpreter/executable.cc | 2
-rw-r--r--  tensorflow/compiler/xla/service/layout_assignment_test.cc | 32
-rw-r--r--  tensorflow/compiler/xla/service/llvm_ir/BUILD | 24
-rw-r--r--  tensorflow/compiler/xla/service/llvm_ir/kernel_support_library.cc | 65
-rw-r--r--  tensorflow/compiler/xla/service/llvm_ir/kernel_support_library.h | 128
-rw-r--r--  tensorflow/compiler/xla/service/llvm_ir/llvm_loop.cc | 68
-rw-r--r--  tensorflow/compiler/xla/service/llvm_ir/llvm_loop.h | 29
-rw-r--r--  tensorflow/compiler/xla/service/llvm_ir/llvm_util.cc | 8
-rw-r--r--  tensorflow/compiler/xla/service/llvm_ir/llvm_util.h | 2
-rw-r--r--  tensorflow/compiler/xla/service/llvm_ir/vector_support_library.cc | 150
-rw-r--r--  tensorflow/compiler/xla/service/llvm_ir/vector_support_library.h | 174
-rw-r--r--  tensorflow/compiler/xla/service/local_service.cc | 20
-rw-r--r--  tensorflow/compiler/xla/service/logical_buffer_analysis.cc | 15
-rw-r--r--  tensorflow/compiler/xla/service/logical_buffer_analysis.h | 2
-rw-r--r--  tensorflow/compiler/xla/service/service.h | 2
-rw-r--r--  tensorflow/compiler/xla/service/shape_inference.cc | 14
-rw-r--r--  tensorflow/compiler/xla/service/tuple_points_to_analysis.cc | 58
-rw-r--r--  tensorflow/compiler/xla/service/tuple_points_to_analysis.h | 2
-rw-r--r--  tensorflow/compiler/xla/service/tuple_points_to_analysis_test.cc | 45
-rw-r--r--  tensorflow/compiler/xla/service/user_computation.cc | 6
-rw-r--r--  tensorflow/compiler/xla/service/while_loop_simplifier.cc | 4
-rw-r--r--  tensorflow/compiler/xla/service/while_loop_simplifier_test.cc | 6
-rw-r--r--  tensorflow/compiler/xla/shape_tree.h | 3
-rw-r--r--  tensorflow/compiler/xla/shape_util.cc | 1
-rw-r--r--  tensorflow/compiler/xla/tests/BUILD | 24
-rw-r--r--  tensorflow/compiler/xla/tests/client_library_test_base.cc | 54
-rw-r--r--  tensorflow/compiler/xla/tests/client_library_test_base.h | 23
-rw-r--r--  tensorflow/compiler/xla/tests/client_test.cc | 4
-rw-r--r--  tensorflow/compiler/xla/tests/compilation_cache_test.cc | 8
-rw-r--r--  tensorflow/compiler/xla/tests/compute_constant_test.cc | 4
-rw-r--r--  tensorflow/compiler/xla/tests/convolution_test.cc | 160
-rw-r--r--  tensorflow/compiler/xla/tests/dot_operation_test.cc | 97
-rw-r--r--  tensorflow/compiler/xla/tests/literal_test_util.cc | 13
-rw-r--r--  tensorflow/compiler/xla/tests/llvm_compiler_test.cc | 143
-rw-r--r--  tensorflow/compiler/xla/tests/local_client_execute_test.cc | 10
-rw-r--r--  tensorflow/compiler/xla/tests/local_client_test_base.cc | 3
-rw-r--r--  tensorflow/compiler/xla/tests/map_test.cc | 8
-rw-r--r--  tensorflow/compiler/xla/tests/reshape_test.cc | 5
-rw-r--r--  tensorflow/compiler/xla/tests/test_utils.cc | 120
-rw-r--r--  tensorflow/compiler/xla/tests/test_utils.h | 64
-rw-r--r--  tensorflow/compiler/xla/tools/BUILD | 1
-rw-r--r--  tensorflow/compiler/xla/tools/parser/README.md | 16
-rw-r--r--  tensorflow/compiler/xla/tools/parser/hlo_lexer.cc | 66
-rw-r--r--  tensorflow/compiler/xla/tools/parser/hlo_lexer.h | 6
-rw-r--r--  tensorflow/compiler/xla/tools/parser/hlo_parser.cc | 1017
-rw-r--r--  tensorflow/compiler/xla/tools/parser/hlo_parser_test.cc | 432
-rw-r--r--  tensorflow/compiler/xla/tools/parser/hlo_token.h | 3
-rw-r--r--  tensorflow/compiler/xla/tools/replay_computation.cc | 1
-rw-r--r--  tensorflow/compiler/xla/types.h | 3
-rw-r--r--  tensorflow/compiler/xla/window_util.cc | 28
-rw-r--r--  tensorflow/compiler/xla/xla_data.proto | 26
-rw-r--r--  tensorflow/contrib/batching/BUILD | 1
-rw-r--r--  tensorflow/contrib/batching/kernels/batch_kernels.cc | 2
-rw-r--r--  tensorflow/contrib/bayesflow/python/kernel_tests/csiszar_divergence_test.py | 2
-rw-r--r--  tensorflow/contrib/boosted_trees/lib/quantiles/weighted_quantiles_buffer.h | 10
-rw-r--r--  tensorflow/contrib/cmake/CMakeLists.txt | 147
-rw-r--r--  tensorflow/contrib/cmake/external/boringssl.cmake | 6
-rw-r--r--  tensorflow/contrib/cmake/external/jsoncpp.cmake | 6
-rw-r--r--  tensorflow/contrib/cmake/external/lmdb.cmake | 6
-rw-r--r--  tensorflow/contrib/cmake/external/png.cmake | 6
-rw-r--r--  tensorflow/contrib/cmake/external/protobuf.cmake | 6
-rw-r--r--  tensorflow/contrib/cmake/external/re2.cmake | 8
-rw-r--r--  tensorflow/contrib/cmake/external/snappy.cmake | 8
-rw-r--r--  tensorflow/contrib/cmake/external/sqlite.cmake | 6
-rw-r--r--  tensorflow/contrib/cmake/external/zlib.cmake | 6
-rw-r--r--  tensorflow/contrib/cmake/tf_c.cmake | 1
-rw-r--r--  tensorflow/contrib/cmake/tf_cc_ops.cmake | 36
-rw-r--r--  tensorflow/contrib/cmake/tf_core_framework.cmake | 3
-rw-r--r--  tensorflow/contrib/cmake/tf_core_kernels.cmake | 24
-rw-r--r--  tensorflow/contrib/cmake/tf_core_ops.cmake | 1
-rw-r--r--  tensorflow/contrib/cmake/tf_label_image_example.cmake | 5
-rwxr-xr-x  tensorflow/contrib/cmake/tf_python.cmake | 40
-rw-r--r--  tensorflow/contrib/cmake/tf_shared_lib.cmake | 45
-rw-r--r--  tensorflow/contrib/cmake/tf_stream_executor.cmake | 3
-rw-r--r--  tensorflow/contrib/cmake/tf_tools.cmake | 13
-rw-r--r--  tensorflow/contrib/cmake/tf_tutorials.cmake | 5
-rw-r--r--  tensorflow/contrib/crf/python/ops/crf.py | 21
-rw-r--r--  tensorflow/contrib/data/BUILD | 13
-rw-r--r--  tensorflow/contrib/data/__init__.py | 4
-rw-r--r--  tensorflow/contrib/data/ops/dataset_ops.cc | 232
-rw-r--r--  tensorflow/contrib/data/python/kernel_tests/BUILD | 9
-rw-r--r--  tensorflow/contrib/data/python/kernel_tests/batch_dataset_op_test.py | 225
-rw-r--r--  tensorflow/contrib/data/python/kernel_tests/dataset_serialization_test_base.py | 77
-rw-r--r--  tensorflow/contrib/data/python/kernel_tests/iterator_ops_test.py | 2
-rw-r--r--  tensorflow/contrib/data/python/kernel_tests/range_dataset_op_test.py | 2
-rw-r--r--  tensorflow/contrib/data/python/kernel_tests/reader_dataset_ops_test.py | 2
-rw-r--r--  tensorflow/contrib/data/python/kernel_tests/sequence_dataset_op_test.py | 78
-rw-r--r--  tensorflow/contrib/data/python/kernel_tests/zip_dataset_op_test.py | 27
-rw-r--r--  tensorflow/contrib/data/python/ops/BUILD | 40
-rw-r--r--  tensorflow/contrib/data/python/ops/batching.py | 89
-rw-r--r--  tensorflow/contrib/data/python/ops/dataset_ops.py | 8
-rw-r--r--  tensorflow/contrib/data/python/ops/error_ops.py | 2
-rw-r--r--  tensorflow/contrib/data/python/ops/grouping.py | 2
-rw-r--r--  tensorflow/contrib/data/python/ops/interleave_ops.py | 2
-rw-r--r--  tensorflow/contrib/data/python/ops/iterator_ops.py | 2
-rw-r--r--  tensorflow/contrib/data/python/ops/readers.py | 2
-rw-r--r--  tensorflow/contrib/data/python/ops/scan_ops.py | 2
-rw-r--r--  tensorflow/contrib/distributions/BUILD | 17
-rw-r--r--  tensorflow/contrib/distributions/__init__.py | 2
-rw-r--r--  tensorflow/contrib/distributions/python/kernel_tests/cauchy_test.py | 437
-rw-r--r--  tensorflow/contrib/distributions/python/ops/cauchy.py | 223
-rw-r--r--  tensorflow/contrib/eager/README.md | 2
-rw-r--r--  tensorflow/contrib/eager/python/examples/notebooks/1_basics.ipynb | 4
-rw-r--r--  tensorflow/contrib/eager/python/examples/notebooks/2_gradients.ipynb | 6
-rw-r--r--  tensorflow/contrib/eager/python/examples/notebooks/3_datasets.ipynb | 10
-rw-r--r--  tensorflow/contrib/eager/python/network.py | 63
-rw-r--r--  tensorflow/contrib/eager/python/network_test.py | 108
-rw-r--r--  tensorflow/contrib/estimator/BUILD | 5
-rw-r--r--  tensorflow/contrib/estimator/python/estimator/head.py | 143
-rw-r--r--  tensorflow/contrib/estimator/python/estimator/head_test.py | 206
-rw-r--r--  tensorflow/contrib/estimator/python/estimator/multi_head.py | 67
-rw-r--r--  tensorflow/contrib/estimator/python/estimator/multi_head_test.py | 188
-rw-r--r--  tensorflow/contrib/estimator/python/estimator/replicate_model_fn.py | 43
-rw-r--r--  tensorflow/contrib/estimator/python/estimator/replicate_model_fn_test.py | 96
-rw-r--r--  tensorflow/contrib/gan/python/estimator/python/gan_estimator_impl.py | 2
-rw-r--r--  tensorflow/contrib/kfac/python/ops/layer_collection.py | 8
-rw-r--r--  tensorflow/contrib/layers/python/layers/layers.py | 18
-rw-r--r--  tensorflow/contrib/layers/python/layers/layers_test.py | 73
-rw-r--r--  tensorflow/contrib/learn/python/learn/estimators/head.py | 2
-rw-r--r--  tensorflow/contrib/learn/python/learn/estimators/model_fn.py | 6
-rw-r--r--  tensorflow/contrib/learn/python/learn/learn_io/data_feeder.py | 12
-rw-r--r--  tensorflow/contrib/linear_optimizer/python/ops/sdca_ops.py | 11
-rw-r--r--  tensorflow/contrib/makefile/Makefile | 3
-rw-r--r--  tensorflow/contrib/makefile/README.md | 41
-rwxr-xr-x  tensorflow/contrib/makefile/build_all_ios.sh | 54
-rwxr-xr-x  tensorflow/contrib/makefile/compile_ios_protobuf.sh | 369
-rwxr-xr-x  tensorflow/contrib/makefile/compile_ios_tensorflow.sh | 155
-rwxr-xr-x  tensorflow/contrib/makefile/compile_nsync.sh | 5
-rw-r--r--  tensorflow/contrib/makefile/tf_op_files.txt | 18
-rw-r--r--  tensorflow/contrib/metrics/__init__.py | 2
-rw-r--r--  tensorflow/contrib/metrics/python/ops/metric_ops.py | 149
-rw-r--r--  tensorflow/contrib/metrics/python/ops/metric_ops_test.py | 262
-rw-r--r--  tensorflow/contrib/nccl/BUILD | 4
-rw-r--r--  tensorflow/contrib/nccl/python/ops/nccl_ops_test.py | 7
-rw-r--r--  tensorflow/contrib/nn/__init__.py | 2
-rw-r--r--  tensorflow/contrib/opt/BUILD | 18
-rw-r--r--  tensorflow/contrib/opt/__init__.py | 5
-rw-r--r--  tensorflow/contrib/opt/python/training/multitask_optimizer_wrapper.py | 138
-rw-r--r--  tensorflow/contrib/opt/python/training/multitask_optimizer_wrapper_test.py | 119
-rw-r--r--  tensorflow/contrib/quantize/BUILD | 18
-rw-r--r--  tensorflow/contrib/quantize/python/quant_ops.py | 57
-rw-r--r--  tensorflow/contrib/quantize/python/quant_ops_test.py | 87
-rw-r--r--  tensorflow/contrib/quantize/python/quantize.py | 6
-rw-r--r--  tensorflow/contrib/quantize/python/quantize_parameterized_test.py | 65
-rw-r--r--  tensorflow/contrib/quantize/python/quantize_test.py | 25
-rw-r--r--  tensorflow/contrib/rnn/python/kernel_tests/core_rnn_cell_test.py | 42
-rw-r--r--  tensorflow/contrib/rnn/python/kernel_tests/rnn_cell_test.py | 44
-rw-r--r--  tensorflow/contrib/rnn/python/ops/rnn_cell.py | 344
-rw-r--r--  tensorflow/contrib/seq2seq/python/kernel_tests/attention_wrapper_test.py | 37
-rw-r--r--  tensorflow/contrib/seq2seq/python/ops/attention_wrapper.py | 57
-rw-r--r--  tensorflow/contrib/slim/BUILD | 2
-rw-r--r--  tensorflow/contrib/slim/README.md | 2
-rw-r--r--  tensorflow/contrib/slim/python/slim/evaluation.py | 15
-rw-r--r--  tensorflow/contrib/slim/python/slim/evaluation_test.py | 46
-rw-r--r--  tensorflow/contrib/slim/python/slim/nets/resnet_v1_test.py | 2
-rw-r--r--  tensorflow/contrib/summary/BUILD | 6
-rw-r--r--  tensorflow/contrib/summary/summary.py | 2
-rw-r--r--  tensorflow/contrib/summary/summary_ops.py | 166
-rw-r--r--  tensorflow/contrib/summary/summary_ops_test.py | 122
-rw-r--r--  tensorflow/contrib/tensorboard/db/BUILD | 2
-rw-r--r--  tensorflow/contrib/tensorboard/db/summary_db_writer.cc | 34
-rw-r--r--  tensorflow/contrib/tensorboard/db/summary_db_writer_test.cc | 56
-rw-r--r--  tensorflow/contrib/tpu/python/tpu/tpu_config.py | 14
-rw-r--r--  tensorflow/contrib/tpu/python/tpu/tpu_estimator.py | 144
-rw-r--r--  tensorflow/contrib/training/python/training/hparam.py | 58
-rw-r--r--  tensorflow/contrib/training/python/training/hparam_test.py | 31
-rw-r--r--  tensorflow/contrib/verbs/README.md | 14
-rw-r--r--  tensorflow/contrib/verbs/rdma.cc | 413
-rw-r--r--  tensorflow/contrib/verbs/rdma.h | 40
-rw-r--r--  tensorflow/core/BUILD | 1
-rw-r--r--  tensorflow/core/common_runtime/bfc_allocator.cc | 13
-rw-r--r--  tensorflow/core/common_runtime/mkl_cpu_allocator.h | 2
-rw-r--r--  tensorflow/core/common_runtime/sycl/sycl_device.h | 22
-rw-r--r--  tensorflow/core/framework/bfloat16.cc | 30
-rw-r--r--  tensorflow/core/framework/bfloat16_test.cc | 92
-rw-r--r--  tensorflow/core/framework/numeric_types.h | 251
-rw-r--r--  tensorflow/core/framework/register_types.h | 5
-rw-r--r--  tensorflow/core/graph/graph.cc | 15
-rw-r--r--  tensorflow/core/graph/graph.h | 5
-rw-r--r--  tensorflow/core/graph/graph_constructor.cc | 10
-rw-r--r--  tensorflow/core/graph/graph_constructor.h | 3
-rw-r--r--  tensorflow/core/graph/graph_constructor_test.cc | 15
-rw-r--r--  tensorflow/core/graph/graph_partition.cc | 4
-rw-r--r--  tensorflow/core/graph/graph_test.cc | 64
-rw-r--r--  tensorflow/core/graph/mkl_graph_util.h | 179
-rw-r--r--  tensorflow/core/graph/mkl_layout_pass.cc | 2
-rw-r--r--  tensorflow/core/graph/mkl_tfconversion_pass.cc | 4
-rw-r--r--  tensorflow/core/grappler/costs/graph_properties.cc | 113
-rw-r--r--  tensorflow/core/grappler/costs/graph_properties.h | 6
-rw-r--r--  tensorflow/core/grappler/costs/graph_properties_test.cc | 32
-rw-r--r--  tensorflow/core/grappler/optimizers/BUILD | 1
-rw-r--r--  tensorflow/core/grappler/optimizers/arithmetic_optimizer.cc | 34
-rw-r--r--  tensorflow/core/grappler/optimizers/arithmetic_optimizer_test.cc | 94
-rw-r--r--  tensorflow/core/grappler/optimizers/constant_folding.cc | 172
-rw-r--r--  tensorflow/core/grappler/optimizers/constant_folding.h | 6
-rw-r--r--  tensorflow/core/grappler/optimizers/constant_folding_test.cc | 80
-rw-r--r--  tensorflow/core/grappler/optimizers/meta_optimizer.cc | 4
-rw-r--r--  tensorflow/core/grappler/utils.cc | 3
-rw-r--r--  tensorflow/core/kernels/BUILD | 59
-rw-r--r--  tensorflow/core/kernels/avgpooling_op.cc | 7
-rw-r--r--  tensorflow/core/kernels/batch_dataset_op.cc | 21
-rw-r--r--  tensorflow/core/kernels/bincount_op.cc | 115
-rw-r--r--  tensorflow/core/kernels/bincount_op.h | 41
-rw-r--r--  tensorflow/core/kernels/bincount_op_gpu.cu.cc | 114
-rw-r--r--  tensorflow/core/kernels/bincount_op_test.cc | 75
-rw-r--r--  tensorflow/core/kernels/bucketize_op.cc | 66
-rw-r--r--  tensorflow/core/kernels/bucketize_op.h | 41
-rw-r--r--  tensorflow/core/kernels/bucketize_op_gpu.cu.cc | 101
-rw-r--r--  tensorflow/core/kernels/concat_lib_cpu.cc | 9
-rw-r--r--  tensorflow/core/kernels/concatenate_dataset_op.cc | 8
-rw-r--r--  tensorflow/core/kernels/conv_grad_ops_3d.cc | 42
-rw-r--r--  tensorflow/core/kernels/conv_ops_3d.cc | 5
-rw-r--r--  tensorflow/core/kernels/cwise_op_acosh.cc | 12
-rw-r--r--  tensorflow/core/kernels/cwise_op_asinh.cc | 14
-rw-r--r--  tensorflow/core/kernels/cwise_op_atanh.cc | 14
-rw-r--r--  tensorflow/core/kernels/cwise_ops.h | 12
-rw-r--r--  tensorflow/core/kernels/dataset.cc | 1
-rw-r--r--  tensorflow/core/kernels/dataset.h | 23
-rw-r--r--  tensorflow/core/kernels/depthwise_conv_grad_op.cc | 10
-rw-r--r--  tensorflow/core/kernels/depthwise_conv_op.cc | 10
-rw-r--r--  tensorflow/core/kernels/depthwise_conv_op.h | 4
-rw-r--r--  tensorflow/core/kernels/depthwise_conv_op_gpu.cu.cc | 19
-rw-r--r--  tensorflow/core/kernels/dynamic_partition_op_gpu.cu.cc | 376
-rw-r--r--  tensorflow/core/kernels/dynamic_partition_op_test.cc | 58
-rw-r--r--  tensorflow/core/kernels/fake_quant_ops_functor.h | 15
-rw-r--r--  tensorflow/core/kernels/fused_batch_norm_op.cc | 70
-rw-r--r--  tensorflow/core/kernels/fused_batch_norm_op.h | 22
-rwxr-xr-x  tensorflow/core/kernels/lmdb_reader_op.cc | 7
-rw-r--r--  tensorflow/core/kernels/maxpooling_op.cc | 47
-rw-r--r--  tensorflow/core/kernels/maxpooling_op_gpu.cu.cc | 40
-rw-r--r--  tensorflow/core/kernels/maxpooling_op_gpu.h | 2
-rw-r--r--  tensorflow/core/kernels/mkl_conv_grad_filter_ops.cc | 78
-rw-r--r--  tensorflow/core/kernels/mkl_conv_grad_input_ops.cc | 86
-rw-r--r--  tensorflow/core/kernels/mkl_conv_ops.cc | 82
-rw-r--r--  tensorflow/core/kernels/mkl_conv_ops.h | 140
-rw-r--r--  tensorflow/core/kernels/mkl_tfconv_op.cc | 124
-rw-r--r--  tensorflow/core/kernels/mkl_tfconv_op.h | 80
-rw-r--r--  tensorflow/core/kernels/pooling_ops_common.cc | 10
-rw-r--r--  tensorflow/core/kernels/pooling_ops_common_gpu.h | 4
-rw-r--r--  tensorflow/core/kernels/quantized_add_op.cc | 2
-rw-r--r--  tensorflow/core/kernels/random_op.cc | 4
-rw-r--r--  tensorflow/core/kernels/range_dataset_op.cc | 1
-rw-r--r--  tensorflow/core/kernels/reader_dataset_ops.cc | 1
-rw-r--r--  tensorflow/core/kernels/repeat_dataset_op.cc | 50
-rw-r--r--  tensorflow/core/kernels/segment_reduction_ops.cc | 3
-rw-r--r--  tensorflow/core/kernels/segment_reduction_ops.h | 36
-rw-r--r--  tensorflow/core/kernels/shape_ops.cc | 43
-rw-r--r--  tensorflow/core/kernels/shape_ops.h | 13
-rw-r--r--  tensorflow/core/kernels/shuffle_dataset_op.cc | 31
-rw-r--r--  tensorflow/core/kernels/skip_dataset_op.cc | 63
-rw-r--r--  tensorflow/core/kernels/slice_op.cc | 116
-rw-r--r--  tensorflow/core/kernels/slice_op.h | 109
-rw-r--r--  tensorflow/core/kernels/slice_op_gpu.cu.cc | 56
-rw-r--r--  tensorflow/core/kernels/strided_slice_op.cc | 1
-rw-r--r--  tensorflow/core/kernels/strided_slice_op_impl.h | 25
-rw-r--r--  tensorflow/core/kernels/strided_slice_op_test.cc | 49
-rw-r--r--  tensorflow/core/kernels/summary_interface.cc | 4
-rw-r--r--  tensorflow/core/kernels/summary_kernels.cc | 50
-rw-r--r--  tensorflow/core/kernels/take_dataset_op.cc | 59
-rw-r--r--  tensorflow/core/kernels/transpose_op.cc | 35
-rw-r--r--  tensorflow/core/kernels/unique_op.cc | 113
-rw-r--r--  tensorflow/core/kernels/zip_dataset_op.cc | 63
-rw-r--r--  tensorflow/core/ops/array_ops.cc | 44
-rw-r--r--  tensorflow/core/ops/compat/ops_history.v1.pbtxt | 498
-rw-r--r--  tensorflow/core/ops/dataset_ops.cc | 197
-rw-r--r--  tensorflow/core/ops/logging_ops.cc | 2
-rw-r--r--  tensorflow/core/ops/math_ops.cc | 2
-rw-r--r--  tensorflow/core/ops/nn_ops.cc | 12
-rw-r--r--  tensorflow/core/ops/ops.pbtxt | 368
-rw-r--r--  tensorflow/core/ops/summary_ops.cc | 41
-rw-r--r--  tensorflow/core/platform/default/build_config.bzl | 21
-rw-r--r--  tensorflow/core/platform/default/build_config/BUILD | 20
-rw-r--r--  tensorflow/core/platform/default/notification.h | 2
-rw-r--r--  tensorflow/core/platform/posix/error.cc | 11
-rw-r--r--  tensorflow/core/platform/posix/port.cc | 6
-rw-r--r--  tensorflow/core/platform/vmodule_benchmark_test.cc | 28
-rw-r--r--  tensorflow/core/platform/vmodule_test.cc | 117
-rw-r--r--  tensorflow/core/public/version.h | 2
-rw-r--r--  tensorflow/core/util/bcast.cc | 2
-rw-r--r--  tensorflow/core/util/device_name_utils.cc | 1
-rw-r--r--  tensorflow/core/util/mkl_util.h | 691
-rw-r--r--  tensorflow/core/util/mkl_util_test.cc | 92
-rw-r--r--  tensorflow/docs_src/api_guides/python/threading_and_queues.md | 2
-rw-r--r--  tensorflow/docs_src/get_started/get_started.md | 6
-rw-r--r--  tensorflow/docs_src/get_started/input_fn.md | 6
-rw-r--r--  tensorflow/docs_src/get_started/monitors.md | 406
-rw-r--r--  tensorflow/docs_src/install/install_c.md | 2
-rw-r--r--  tensorflow/docs_src/install/install_go.md | 2
-rw-r--r--  tensorflow/docs_src/install/install_java.md | 18
-rw-r--r--  tensorflow/docs_src/install/install_linux.md | 22
-rw-r--r--  tensorflow/docs_src/install/install_mac.md | 10
-rw-r--r--  tensorflow/docs_src/install/install_sources.md | 19
-rw-r--r--  tensorflow/docs_src/mobile/index.md | 4
-rw-r--r--  tensorflow/docs_src/mobile/prepare_models.md | 2
-rw-r--r--  tensorflow/docs_src/performance/xla/operation_semantics.md | 89
-rw-r--r--  tensorflow/docs_src/programmers_guide/debugger.md | 29
-rw-r--r--  tensorflow/docs_src/programmers_guide/tensors.md | 12
-rw-r--r--  tensorflow/docs_src/tutorials/deep_cnn.md | 14
-rw-r--r--  tensorflow/docs_src/tutorials/word2vec.md | 10
-rw-r--r--  tensorflow/examples/image_retraining/retrain.py | 82
-rw-r--r--  tensorflow/examples/image_retraining/retrain_test.py | 23
-rw-r--r--  tensorflow/examples/learn/iris.py | 5
-rw-r--r--  tensorflow/examples/learn/wide_n_deep_tutorial.py | 5
-rw-r--r--  tensorflow/examples/speech_commands/models.py | 2
-rw-r--r--  tensorflow/go/android.go | 6
-rw-r--r--  tensorflow/go/op/wrappers.go | 467
-rw-r--r--  tensorflow/go/operation_test.go | 8
-rw-r--r--  tensorflow/go/tensor.go | 9
-rw-r--r--  tensorflow/go/tensor_test.go | 9
-rw-r--r--  tensorflow/java/src/main/java/org/tensorflow/NativeLibrary.java | 43
-rw-r--r--  tensorflow/java/src/main/java/org/tensorflow/Shape.java | 32
-rw-r--r--  tensorflow/java/src/main/java/org/tensorflow/types/TFBool.java | 30
-rw-r--r--  tensorflow/java/src/main/java/org/tensorflow/types/TFDouble.java | 30
-rw-r--r--  tensorflow/java/src/main/java/org/tensorflow/types/TFFloat.java | 30
-rw-r--r--  tensorflow/java/src/main/java/org/tensorflow/types/TFInt32.java | 30
-rw-r--r--  tensorflow/java/src/main/java/org/tensorflow/types/TFInt64.java | 30
-rw-r--r--  tensorflow/java/src/main/java/org/tensorflow/types/TFString.java | 27
-rw-r--r--  tensorflow/java/src/main/java/org/tensorflow/types/TFType.java | 20
-rw-r--r--  tensorflow/java/src/main/java/org/tensorflow/types/TFUInt8.java | 30
-rw-r--r--  tensorflow/java/src/main/java/org/tensorflow/types/Types.java | 52
-rw-r--r--  tensorflow/java/src/test/java/org/tensorflow/ShapeTest.java | 26
-rw-r--r--  tensorflow/python/client/session_clusterspec_prop_test.py | 2
-rw-r--r--  tensorflow/python/client/tf_session.i | 10
-rw-r--r--  tensorflow/python/client/timeline.py | 2
-rw-r--r--  tensorflow/python/debug/wrappers/dumping_wrapper_test.py | 18
-rw-r--r--  tensorflow/python/eager/BUILD | 7
-rw-r--r--  tensorflow/python/eager/backprop.py | 16
-rw-r--r--  tensorflow/python/eager/backprop_test.py | 57
-rw-r--r--  tensorflow/python/eager/benchmarks_test.py | 3
-rw-r--r--  tensorflow/python/eager/execute.py | 3
-rw-r--r--  tensorflow/python/eager/function_test.py | 10
-rw-r--r--  tensorflow/python/eager/graph_callable.py | 13
-rw-r--r--  tensorflow/python/eager/imperative_grad.py | 196
-rw-r--r--  tensorflow/python/eager/pywrap_tensor.cc | 8
-rw-r--r--  tensorflow/python/eager/pywrap_tensor.h | 25
-rw-r--r--  tensorflow/python/eager/pywrap_tfe.h | 13
-rw-r--r--  tensorflow/python/eager/pywrap_tfe_src.cc | 334
-rw-r--r--  tensorflow/python/eager/tape.py | 12
-rw-r--r--  tensorflow/python/eager/tape_test.py | 20
-rw-r--r--  tensorflow/python/estimator/BUILD | 63
-rw-r--r--  tensorflow/python/estimator/canned/baseline.py | 349
-rw-r--r--  tensorflow/python/estimator/canned/baseline_test.py | 1545
-rw-r--r--  tensorflow/python/estimator/canned/head.py | 133
-rw-r--r--  tensorflow/python/estimator/canned/head_test.py | 10
-rw-r--r--  tensorflow/python/estimator/estimator.py | 2
-rw-r--r--  tensorflow/python/estimator/estimator_lib.py | 4
-rw-r--r--  tensorflow/python/estimator/estimator_test.py | 66
-rw-r--r--  tensorflow/python/estimator/inputs/numpy_io.py | 83
-rw-r--r--  tensorflow/python/estimator/inputs/numpy_io_test.py | 87
-rw-r--r--  tensorflow/python/framework/function.py | 2
-rw-r--r--  tensorflow/python/framework/function_test.py | 2
-rw-r--r--  tensorflow/python/framework/ops.py | 80
-rw-r--r--  tensorflow/python/framework/ops_test.py | 87
-rw-r--r--  tensorflow/python/framework/tensor_util.py | 1
-rw-r--r--  tensorflow/python/framework/test_ops.cc | 23
-rw-r--r--  tensorflow/python/framework/test_util.py | 3
-rw-r--r--  tensorflow/python/grappler/model_analyzer.cc | 9
-rw-r--r--  tensorflow/python/keras/BUILD | 12
-rw-r--r--  tensorflow/python/keras/_impl/keras/engine/topology.py | 9
-rw-r--r--  tensorflow/python/keras/_impl/keras/integration_test.py | 2
-rw-r--r--  tensorflow/python/keras/_impl/keras/layers/gru_test.py | 12
-rw-r--r--  tensorflow/python/keras/_impl/keras/layers/lstm_test.py | 11
-rw-r--r--  tensorflow/python/keras/_impl/keras/layers/recurrent.py | 2383
-rw-r--r--  tensorflow/python/keras/_impl/keras/layers/recurrent_test.py | 378
-rw-r--r--  tensorflow/python/keras/_impl/keras/layers/simplernn_test.py | 12
-rw-r--r--  tensorflow/python/keras/layers/__init__.py | 5
-rw-r--r--  tensorflow/python/kernel_tests/BUILD | 1
-rw-r--r--  tensorflow/python/kernel_tests/array_ops_test.py | 52
-rw-r--r--  tensorflow/python/kernel_tests/bincount_op_test.py | 25
-rw-r--r--  tensorflow/python/kernel_tests/bucketize_op_test.py | 8
-rw-r--r--  tensorflow/python/kernel_tests/check_ops_test.py | 311
-rw-r--r--  tensorflow/python/kernel_tests/constant_op_test.py | 14
-rw-r--r--  tensorflow/python/kernel_tests/conv1d_test.py | 43
-rw-r--r--  tensorflow/python/kernel_tests/conv_ops_3d_test.py | 267
-rw-r--r--  tensorflow/python/kernel_tests/depthwise_conv_op_test.py | 20
-rw-r--r--  tensorflow/python/kernel_tests/distributions/BUILD | 1
-rw-r--r--  tensorflow/python/kernel_tests/distributions/multinomial_test.py | 20
-rw-r--r--  tensorflow/python/kernel_tests/dynamic_partition_op_test.py | 106
-rw-r--r--  tensorflow/python/kernel_tests/gather_nd_op_test.py | 10
-rw-r--r--  tensorflow/python/kernel_tests/iterator_ops_test.py | 62
-rw-r--r--  tensorflow/python/kernel_tests/pooling_ops_test.py | 60
-rw-r--r--  tensorflow/python/kernel_tests/range_dataset_op_test.py | 330
-rw-r--r--  tensorflow/python/kernel_tests/reader_dataset_ops_test.py | 298
-rw-r--r--  tensorflow/python/kernel_tests/reader_ops_test.py | 41
-rw-r--r--  tensorflow/python/kernel_tests/segment_reduction_ops_test.py | 29
-rw-r--r--  tensorflow/python/kernel_tests/shape_ops_test.py | 10
-rw-r--r--  tensorflow/python/kernel_tests/slice_op_test.py | 25
-rw-r--r--  tensorflow/python/kernel_tests/unique_op_test.py | 26
-rw-r--r--  tensorflow/python/kernel_tests/variable_scope_test.py | 12
-rw-r--r--  tensorflow/python/kernel_tests/xent_op_test.py | 18
-rw-r--r--  tensorflow/python/layers/base.py | 25
-rw-r--r--  tensorflow/python/layers/base_test.py | 9
-rw-r--r--  tensorflow/python/layers/convolutional.py | 2
-rw-r--r--  tensorflow/python/layers/normalization.py | 22
-rw-r--r--  tensorflow/python/layers/normalization_test.py | 98
-rw-r--r--  tensorflow/python/ops/array_grad.py | 6
-rw-r--r--  tensorflow/python/ops/array_ops.py | 56
-rw-r--r--  tensorflow/python/ops/check_ops.py | 79
-rw-r--r--  tensorflow/python/ops/control_flow_ops.py | 41
-rw-r--r--  tensorflow/python/ops/ctc_ops.py | 30
-rw-r--r--  tensorflow/python/ops/distributions/dirichlet.py | 2
-rw-r--r--  tensorflow/python/ops/distributions/multinomial.py | 49
-rw-r--r--  tensorflow/python/ops/embedding_ops.py | 9
-rw-r--r--  tensorflow/python/ops/image_ops_impl.py | 23
-rw-r--r--  tensorflow/python/ops/linalg_ops.py | 31
-rw-r--r--  tensorflow/python/ops/math_grad_test.py | 17
-rw-r--r--  tensorflow/python/ops/math_ops.py | 220
-rw-r--r--  tensorflow/python/ops/metrics_impl.py | 2
-rw-r--r--  tensorflow/python/ops/nn.py | 1
-rw-r--r--  tensorflow/python/ops/nn_fused_batchnorm_test.py | 119
-rw-r--r--  tensorflow/python/ops/nn_grad.py | 5
-rw-r--r--  tensorflow/python/ops/nn_impl.py | 26
-rw-r--r--  tensorflow/python/ops/nn_ops.py | 289
-rw-r--r--  tensorflow/python/ops/variable_scope.py | 5
-rw-r--r--  tensorflow/python/ops/variables.py | 4
-rw-r--r--  tensorflow/python/pywrap_tfe.i | 4
-rwxr-xr-x [-rw-r--r--]  tensorflow/python/tools/import_pb_to_tensorboard.py | 0
-rw-r--r--  tensorflow/python/tools/inspect_checkpoint.py | 23
-rw-r--r--  tensorflow/python/training/monitored_session.py | 2
-rw-r--r--  tensorflow/python/util/tf_should_use.py | 2
-rw-r--r--  tensorflow/stream_executor/cuda/cuda_dnn.cc | 13
-rw-r--r--  tensorflow/stream_executor/dnn.cc | 16
-rw-r--r--  tensorflow/stream_executor/dnn.h | 6
-rw-r--r--  tensorflow/tensorflow.bzl | 2
-rw-r--r--  tensorflow/tools/api/golden/tensorflow.estimator.-baseline-classifier.pbtxt | 54
-rw-r--r--  tensorflow/tools/api/golden/tensorflow.estimator.-baseline-regressor.pbtxt | 54
-rw-r--r--  tensorflow/tools/api/golden/tensorflow.estimator.pbtxt | 8
-rw-r--r--  tensorflow/tools/api/golden/tensorflow.keras.layers.-g-r-u-cell.pbtxt | 179
-rw-r--r--  tensorflow/tools/api/golden/tensorflow.keras.layers.-g-r-u.pbtxt | 86
-rw-r--r--  tensorflow/tools/api/golden/tensorflow.keras.layers.-l-s-t-m-cell.pbtxt | 179
-rw-r--r--  tensorflow/tools/api/golden/tensorflow.keras.layers.-l-s-t-m.pbtxt | 90
-rw-r--r--  tensorflow/tools/api/golden/tensorflow.keras.layers.-r-n-n.pbtxt | 191
-rw-r--r--  tensorflow/tools/api/golden/tensorflow.keras.layers.-simple-r-n-n-cell.pbtxt | 179
-rw-r--r--  tensorflow/tools/api/golden/tensorflow.keras.layers.-simple-r-n-n.pbtxt | 78
-rw-r--r--  tensorflow/tools/api/golden/tensorflow.keras.layers.-stacked-r-n-n-cells.pbtxt | 183
-rw-r--r--  tensorflow/tools/api/golden/tensorflow.keras.layers.pbtxt | 20
-rw-r--r--  tensorflow/tools/api/golden/tensorflow.linalg.pbtxt | 2
-rw-r--r--  tensorflow/tools/api/golden/tensorflow.nn.pbtxt | 10
-rw-r--r--  tensorflow/tools/api/golden/tensorflow.pbtxt | 22
-rwxr-xr-x  tensorflow/tools/ci_build/ci_sanity.sh | 3
-rwxr-xr-x  tensorflow/tools/ci_build/install/install_golang.sh | 2
-rwxr-xr-x  tensorflow/tools/ci_build/linux/libtensorflow_docker.sh | 2
-rwxr-xr-x  tensorflow/tools/ci_build/osx/libtensorflow_cpu.sh | 2
-rwxr-xr-x  tensorflow/tools/ci_build/osx/libtensorflow_gpu.sh | 2
-rwxr-xr-x  tensorflow/tools/ci_build/pi/build_raspberry_pi.sh | 6
-rw-r--r--  tensorflow/tools/ci_build/windows/bazel/bazel_test_lib.sh | 4
-rw-r--r--  tensorflow/tools/docker/Dockerfile.devel | 5
-rw-r--r--  tensorflow/tools/docker/Dockerfile.devel-gpu-cuda9-cudnn7 | 7
-rw-r--r--  tensorflow/tools/docker/Dockerfile.gpu | 2
-rw-r--r--  tensorflow/tools/docker/README.md | 14
-rw-r--r--  tensorflow/tools/graph_transforms/BUILD | 2
-rw-r--r--  tensorflow/tools/graph_transforms/quantize_nodes.cc | 2
-rw-r--r--  tensorflow/tools/pip_package/setup.py | 2
-rw-r--r--  tensorflow/workspace.bzl | 45
-rw-r--r--  third_party/aws.BUILD | 3
-rw-r--r--  third_party/boringssl/add_boringssl_s390x.patch | 133
-rw-r--r--  third_party/curl.BUILD | 1
-rw-r--r--  third_party/nanopb.BUILD | 23
-rwxr-xr-x  third_party/sycl/crosstool/CROSSTOOL.tpl | 8
-rw-r--r--  third_party/sycl/crosstool/trisycl.tpl | 73
-rwxr-xr-x  third_party/sycl/sycl/BUILD.tpl | 17
-rwxr-xr-x  third_party/sycl/sycl/build_defs.bzl.tpl | 17
-rw-r--r--  third_party/sycl/sycl_configure.bzl | 86
-rw-r--r--  third_party/tflite_mobilenet.BUILD | 13
-rw-r--r--  third_party/zlib.BUILD | 2
-rw-r--r--  tools/bazel.rc | 7
542 files changed, 10691 insertions, 20607 deletions
diff --git a/CODE_OF_CONDUCT.md b/CODE_OF_CONDUCT.md
index 10fd595fec..cfc45049f7 100644
--- a/CODE_OF_CONDUCT.md
+++ b/CODE_OF_CONDUCT.md
@@ -42,7 +42,7 @@ The Code of Conduct also applies within project spaces and in public spaces when
Conflicts in an open source project can take many forms, from someone having a bad day and using harsh and hurtful language in the issue queue, to more serious instances such as sexist/racist statements or threats of violence, and everything in between.
-If the behaviour is threatening or harassing, or for other reasons requires immediate escalation, please see below.
+If the behavior is threatening or harassing, or for other reasons requires immediate escalation, please see below.
However, for the vast majority of issues, we aim to empower individuals to first resolve conflicts themselves, asking for help when needed, and only after that fails to escalate further. This approach gives people more control over the outcome of their dispute.
diff --git a/README.md b/README.md
index 24bbb6cec1..aff3427bdd 100644
--- a/README.md
+++ b/README.md
@@ -73,11 +73,11 @@ $ python
## For more information
-* [TensorFlow website](https://www.tensorflow.org)
+* [TensorFlow Website](https://www.tensorflow.org)
* [TensorFlow White Papers](https://www.tensorflow.org/about/bib)
* [TensorFlow Model Zoo](https://github.com/tensorflow/models)
* [TensorFlow MOOC on Udacity](https://www.udacity.com/course/deep-learning--ud730)
-* [TensorFlow course at Stanford](https://web.stanford.edu/class/cs20si)
+* [TensorFlow Course at Stanford](https://web.stanford.edu/class/cs20si)
Learn more about the TensorFlow community at the [community page of tensorflow.org](https://www.tensorflow.org/community) for a few ways to participate.
diff --git a/configure.py b/configure.py
index e98367ef9f..83ee01c630 100644
--- a/configure.py
+++ b/configure.py
@@ -43,6 +43,7 @@ _DEFAULT_CUDA_PATH_WIN = ('C:/Program Files/NVIDIA GPU Computing '
                          'Toolkit/CUDA/v%s' % _DEFAULT_CUDA_VERSION)
_TF_OPENCL_VERSION = '1.2'
_DEFAULT_COMPUTECPP_TOOLKIT_PATH = '/usr/local/computecpp'
+_DEFAULT_TRISYCL_INCLUDE_DIR = '/usr/local/triSYCL/include'
def is_windows():
@@ -487,11 +488,10 @@ def set_cc_opt_flags(environ_cp):
  cc_opt_flags = get_from_env_or_user_or_default(environ_cp, 'CC_OPT_FLAGS',
                                                 question, default_cc_opt_flags)
  for opt in cc_opt_flags.split():
-    write_to_bazelrc('build:opt --cxxopt=%s --copt=%s' % (opt, opt))
-  host_opt = '-march=native'  # It should be safe on the same build host.
-  write_to_bazelrc(
-      'build:opt --host_cxxopt=%s --host_copt=%s' % (host_opt, host_opt))
-  write_to_bazelrc('build:opt --define with_default_optimizations=true')
+    host_opt = '-march=native'  # It should be safe on the same build host.
+    write_to_bazelrc(
+        'build:opt --cxxopt=%s --copt=%s' % (opt, opt) +
+        ' --host_cxxopt=%s --host_copt=%s' % (host_opt, host_opt))
def set_tf_cuda_clang(environ_cp):
@@ -641,7 +641,7 @@ def set_tf_cuda_version(environ_cp):
  write_action_env_to_bazelrc('TF_CUDA_VERSION', tf_cuda_version)
-def set_tf_cunn_version(environ_cp):
+def set_tf_cudnn_version(environ_cp):
"""Set CUDNN_INSTALL_PATH and TF_CUDNN_VERSION."""
ask_cudnn_version = (
'Please specify the cuDNN version you want to use. '
@@ -887,6 +887,27 @@ def set_computecpp_toolkit_path(environ_cp):
  write_action_env_to_bazelrc('COMPUTECPP_TOOLKIT_PATH',
                              computecpp_toolkit_path)
+def set_trisycl_include_dir(environ_cp):
+ """Set TRISYCL_INCLUDE_DIR"""
+ ask_trisycl_include_dir = ('Please specify the location of the triSYCL '
+ 'include directory. (Use --config=sycl_trisycl '
+ 'when building with Bazel) '
+ '[Default is %s]: '
+ ) % (_DEFAULT_TRISYCL_INCLUDE_DIR)
+ while True:
+ trisycl_include_dir = get_from_env_or_user_or_default(
+ environ_cp, 'TRISYCL_INCLUDE_DIR', ask_trisycl_include_dir,
+ _DEFAULT_TRISYCL_INCLUDE_DIR)
+ if os.path.exists(trisycl_include_dir):
+ break
+
+ print('Invalid triSYCL include directory, %s cannot be found'
+ % (trisycl_include_dir))
+
+ # Set TRISYCL_INCLUDE_DIR
+ environ_cp['TRISYCL_INCLUDE_DIR'] = trisycl_include_dir
+ write_action_env_to_bazelrc('TRISYCL_INCLUDE_DIR',
+ trisycl_include_dir)
def set_mpi_home(environ_cp):
"""Set MPI_HOME."""
@@ -999,6 +1020,8 @@ def main():
    environ_cp['TF_NEED_GCP'] = '0'
    environ_cp['TF_NEED_HDFS'] = '0'
    environ_cp['TF_NEED_JEMALLOC'] = '0'
+    environ_cp['TF_NEED_OPENCL_SYCL'] = '0'
+    environ_cp['TF_NEED_COMPUTECPP'] = '0'
    environ_cp['TF_NEED_OPENCL'] = '0'
    environ_cp['TF_NEED_S3'] = '0'
    environ_cp['TF_CUDA_CLANG'] = '0'
@@ -1021,17 +1044,21 @@ def main():
  set_build_var(environ_cp, 'TF_NEED_VERBS', 'VERBS', 'with_verbs_support',
                False, 'verbs')
-  set_action_env_var(environ_cp, 'TF_NEED_OPENCL', 'OpenCL', False)
-  if environ_cp.get('TF_NEED_OPENCL') == '1':
+  set_action_env_var(environ_cp, 'TF_NEED_OPENCL_SYCL', 'OpenCL SYCL', False)
+  if environ_cp.get('TF_NEED_OPENCL_SYCL') == '1':
    set_host_cxx_compiler(environ_cp)
    set_host_c_compiler(environ_cp)
-    set_computecpp_toolkit_path(environ_cp)
+    set_action_env_var(environ_cp, 'TF_NEED_COMPUTECPP', 'ComputeCPP', True)
+    if environ_cp.get('TF_NEED_COMPUTECPP') == '1':
+      set_computecpp_toolkit_path(environ_cp)
+    else:
+      set_trisycl_include_dir(environ_cp)
  set_action_env_var(environ_cp, 'TF_NEED_CUDA', 'CUDA', False)
  if (environ_cp.get('TF_NEED_CUDA') == '1' and
      'TF_CUDA_CONFIG_REPO' not in environ_cp):
    set_tf_cuda_version(environ_cp)
-    set_tf_cunn_version(environ_cp)
+    set_tf_cudnn_version(environ_cp)
    set_tf_cuda_compute_capabilities(environ_cp)
    set_tf_cuda_clang(environ_cp)
diff --git a/tensorflow/BUILD b/tensorflow/BUILD
index 8cb7edcc50..9874f95ea3 100644
--- a/tensorflow/BUILD
+++ b/tensorflow/BUILD
@@ -55,6 +55,15 @@ config_setting(
)
config_setting(
+    name = "raspberry_pi_armeabi",
+    values = {
+        "crosstool_top": "@local_config_arm_compiler//:toolchain",
+        "cpu": "armeabi",
+    },
+    visibility = ["//visibility:public"],
+)
+
+config_setting(
    name = "android_arm",
    values = {
        "crosstool_top": "//external:android/crosstool",
@@ -110,7 +119,7 @@ config_setting(
config_setting(
    name = "no_tensorflow_py_deps",
-    define_values = {"no_tensorflow_py_deps": "true"},
+    values = {"define": "no_tensorflow_py_deps=true"},
    visibility = ["//visibility:public"],
)
@@ -166,122 +175,55 @@ config_setting(
# TODO(jhseu): Enable on other platforms other than Linux.
config_setting(
    name = "with_jemalloc_linux_x86_64",
-    define_values = {"with_jemalloc": "true"},
-    values = {"cpu": "k8"},
+    values = {
+        "cpu": "k8",
+        "define": "with_jemalloc=true",
+    },
    visibility = ["//visibility:public"],
)
config_setting(
    name = "with_jemalloc_linux_ppc64le",
-    define_values = {"with_jemalloc": "true"},
-    values = {"cpu": "ppc"},
-    visibility = ["//visibility:public"],
-)
-
-config_setting(
-    name = "with_default_optimizations",
-    define_values = {"with_default_optimizations": "true"},
+    values = {
+        "cpu": "ppc",
+        "define": "with_jemalloc=true",
+    },
    visibility = ["//visibility:public"],
)
config_setting(
    name = "with_gcp_support",
-    define_values = {"with_gcp_support": "true"},
+    values = {"define": "with_gcp_support=true"},
    visibility = ["//visibility:public"],
)
config_setting(
    name = "with_hdfs_support",
-    define_values = {"with_hdfs_support": "true"},
+    values = {"define": "with_hdfs_support=true"},
    visibility = ["//visibility:public"],
)
config_setting(
    name = "with_s3_support",
-    define_values = {"with_s3_support": "true"},
-    visibility = ["//visibility:public"],
-)
-
-# Crosses between platforms and file system libraries not supported on those
-# platforms due to limitations in nested select() statements.
-config_setting(
-    name = "with_gcp_support_windows_override",
-    define_values = {"with_gcp_support": "true"},
-    values = {"cpu": "x64_windows"},
-    visibility = ["//visibility:public"],
-)
-
-config_setting(
-    name = "with_hdfs_support_windows_override",
-    define_values = {"with_hdfs_support": "true"},
-    values = {"cpu": "x64_windows"},
-    visibility = ["//visibility:public"],
-)
-
-config_setting(
-    name = "with_s3_support_windows_override",
-    define_values = {"with_s3_support": "true"},
-    values = {"cpu": "x64_windows"},
-    visibility = ["//visibility:public"],
-)
-
-config_setting(
-    name = "with_gcp_support_android_override",
-    define_values = {"with_gcp_support": "true"},
-    values = {"crosstool_top": "//external:android/crosstool"},
-    visibility = ["//visibility:public"],
-)
-
-config_setting(
-    name = "with_hdfs_support_android_override",
-    define_values = {"with_hdfs_support": "true"},
-    values = {"crosstool_top": "//external:android/crosstool"},
-    visibility = ["//visibility:public"],
-)
-
-config_setting(
-    name = "with_s3_support_android_override",
-    define_values = {"with_s3_support": "true"},
-    values = {"crosstool_top": "//external:android/crosstool"},
-    visibility = ["//visibility:public"],
-)
-
-config_setting(
-    name = "with_gcp_support_ios_override",
-    define_values = {"with_gcp_support": "true"},
-    values = {"crosstool_top": "//tools/osx/crosstool:crosstool"},
-    visibility = ["//visibility:public"],
-)
-
-config_setting(
-    name = "with_hdfs_support_ios_override",
-    define_values = {"with_hdfs_support": "true"},
-    values = {"crosstool_top": "//tools/osx/crosstool:crosstool"},
-    visibility = ["//visibility:public"],
-)
-
-config_setting(
-    name = "with_s3_support_ios_override",
-    define_values = {"with_s3_support": "true"},
-    values = {"crosstool_top": "//tools/osx/crosstool:crosstool"},
+    values = {"define": "with_s3_support=true"},
    visibility = ["//visibility:public"],
)
config_setting(
    name = "with_xla_support",
-    define_values = {"with_xla_support": "true"},
+    values = {"define": "with_xla_support=true"},
    visibility = ["//visibility:public"],
)
config_setting(
    name = "with_gdr_support",
-    define_values = {"with_gdr_support": "true"},
+    values = {"define": "with_gdr_support=true"},
    visibility = ["//visibility:public"],
)
config_setting(
    name = "with_verbs_support",
-    define_values = {"with_verbs_support": "true"},
+    values = {"define": "with_verbs_support=true"},
    visibility = ["//visibility:public"],
)
@@ -355,7 +297,7 @@ config_setting(
    visibility = ["//visibility:public"],
)
-# Make a dummy rule that we can change "default" in select statements to.
+# Make a dummy rule that we can chaqnge "default" in select statements to.
# to disable dependencies in copybara.
config_setting(
    name = "dummy_disabled_internal",
@@ -384,6 +326,14 @@ filegroup(
    visibility = ["//tensorflow:__subpackages__"],
)
+py_library(
+    name = "tensorflow_py",
+    srcs = ["__init__.py"],
+    srcs_version = "PY2AND3",
+    visibility = ["//visibility:public"],
+    deps = ["//tensorflow/python"],
+)
+
filegroup(
    name = "all_opensource_files",
    data = [
@@ -737,11 +687,3 @@ tf_cc_shared_object(
        "//tensorflow/core:tensorflow",
    ],
)
-
-py_library(
-    name = "tensorflow_py",
-    srcs = ["__init__.py"],
-    srcs_version = "PY2AND3",
-    visibility = ["//visibility:public"],
-    deps = ["//tensorflow/python"],
-)
diff --git a/tensorflow/c/c_api.cc b/tensorflow/c/c_api.cc
index dd638de3c6..6dd1b99910 100644
--- a/tensorflow/c/c_api.cc
+++ b/tensorflow/c/c_api.cc
@@ -890,8 +890,8 @@ const tensorflow::AttrValue* GetAttrValue(TF_Operation* oper,
                                          TF_Status* status) {
  const tensorflow::AttrValue* attr = oper->node.attrs().Find(attr_name);
  if (attr == nullptr) {
-    status->status = InvalidArgument("Operation '", oper->node.name(),
-                                     "' has no attr named '", attr_name, "'.");
+    status->status =
+        InvalidArgument("Operation has no attr named '", attr_name, "'.");
  }
  return attr;
}
diff --git a/tensorflow/c/c_api_test.cc b/tensorflow/c/c_api_test.cc
index e0057eb51c..05881e619b 100644
--- a/tensorflow/c/c_api_test.cc
+++ b/tensorflow/c/c_api_test.cc
@@ -383,7 +383,7 @@ TEST(CAPI, Graph) {
  EXPECT_EQ(TF_INVALID_ARGUMENT, TF_GetCode(s));
  ASSERT_FALSE(GetAttrValue(feed, "missing", &attr_value, s));
-  EXPECT_EQ(string("Operation 'feed' has no attr named 'missing'."),
+  EXPECT_EQ(string("Operation has no attr named 'missing'."),
            string(TF_Message(s)));
  // Make a constant oper with the scalar "3".
@@ -1054,7 +1054,7 @@ class CApiColocationTest : public ::testing::Test {
    TF_OperationGetAttrMetadata(op, tensorflow::kColocationAttrName, s_);
    if (expected.empty()) {
      ASSERT_EQ(TF_INVALID_ARGUMENT, TF_GetCode(s_)) << TF_Message(s_);
-      EXPECT_EQ(std::string("Operation 'add' has no attr named '_class'."),
+      EXPECT_EQ(std::string("Operation has no attr named '_class'."),
                std::string(TF_Message(s_)));
      return;
    }
diff --git a/tensorflow/c/eager/BUILD b/tensorflow/c/eager/BUILD
index d533758e36..c77896b80b 100644
--- a/tensorflow/c/eager/BUILD
+++ b/tensorflow/c/eager/BUILD
@@ -39,7 +39,6 @@ tf_cuda_library(
tf_cuda_library(
    name = "c_api_internal",
    hdrs = ["c_api_internal.h"],
-    visibility = ["//tensorflow:internal"],
    deps = [
        ":c_api",
        ":runtime",
@@ -106,6 +105,7 @@ tf_cc_test(
cc_library(
    name = "tape",
+    srcs = ["tape.cc"],
    hdrs = ["tape.h"],
    visibility = ["//tensorflow:internal"],
    deps = [
diff --git a/tensorflow/c/eager/tape.cc b/tensorflow/c/eager/tape.cc
new file mode 100644
index 0000000000..464612a81e
--- /dev/null
+++ b/tensorflow/c/eager/tape.cc
@@ -0,0 +1,102 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/c/eager/tape.h"
+
+namespace tensorflow {
+namespace eager {
+
+bool GradientTape::ShouldRecord(gtl::ArraySlice<int64> tensor_ids) {
+  for (int64 i : tensor_ids) {
+    if (tensor_tape_.find(i) != tensor_tape_.end()) {
+      return true;
+    }
+  }
+  return false;
+}
+
+void GradientTape::Watch(int64 tensor_id) {
+  tensor_tape_.emplace(tensor_id, -1);
+}
+
+void GradientTape::RecordOperation(
+    const string& op_type, gtl::ArraySlice<TapeTensor> output_tensors,
+    gtl::ArraySlice<int64> input_tensor_id, void* backward_function,
+    const std::function<void()>& backward_function_deleter) {
+  if (!ShouldRecord(input_tensor_id)) {
+    backward_function_deleter();
+    return;
+  }
+  std::vector<int64> ids;
+  ids.reserve(input_tensor_id.size());
+  for (int64 i : input_tensor_id) {
+    tensor_usage_[i]++;
+    ids.push_back(i);
+  }
+  const int64 op_id = next_op_id_++;
+  std::vector<TapeTensor> tensors;
+  tensors.reserve(output_tensors.size());
+  for (const TapeTensor& o : output_tensors) {
+    // Note: the tensor can have already been watched and hence be in the tape,
+    // so we cannot check that we're inserting it here.
+    tensor_tape_[o.id] = op_id;
+    tensor_usage_[o.id] = 1;
+    tensors.push_back(o);
+  }
+  op_tape_[op_id] = OpTapeEntry{op_type, tensors, ids, backward_function,
+                                backward_function_deleter};
+}
+
+void GradientTape::DeleteTrace(int64 tensor_id) {
+  auto it = tensor_usage_.find(tensor_id);
+  if (it == tensor_usage_.end()) {
+    return;
+  }
+  it->second--;
+  if (it->second != 0) {
+    return;
+  }
+  tensor_usage_.erase(it);
+  auto tensor_op_it = tensor_tape_.find(tensor_id);
+  if (tensor_op_it == tensor_tape_.end()) {
+    return;
+  }
+  const int64 op_id = tensor_op_it->second;
+  if (op_id == -1) {
+    // Do not delete watched tensors.
+    return;
+  }
+  tensor_tape_.erase(tensor_op_it);
+  auto op_it = op_tape_.find(op_id);
+  CHECK(op_it != op_tape_.end());
+  for (const auto& output : op_it->second.output_tensor_info) {
+    if (tensor_usage_.find(output.id) != tensor_usage_.end()) {
+      // Found a usage for an output, so cannot delete the op.
+      return;
+    }
+  }
+  for (int64 id : op_it->second.input_tensor_id) {
+    DeleteTrace(id);
+  }
+  op_it->second.backward_function_deleter();
+  op_tape_.erase(op_it);
+}
+
+std::pair<TensorTape, OpTape> GradientTape::Export() {
+  return {std::move(tensor_tape_), std::move(op_tape_)};
+}
+
+}  // namespace eager
+}  // namespace tensorflow
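
For orientation, here is a minimal, self-contained sketch of the reference-counting scheme the tape.cc above implements: each tensor id maps to the op that produced it, each op entry keeps the ids it consumed and produced, and DeleteTrace frees an op only once none of its outputs is referenced. The names (MiniTape, OpEntry) are simplified stand-ins, not the TensorFlow API, and backward functions are omitted.

// mini_tape.cc -- illustrative only; simplified from tensorflow/c/eager/tape.cc.
#include <cassert>
#include <cstdint>
#include <iostream>
#include <unordered_map>
#include <vector>

// One tape entry: the ids an op consumed and produced.
struct OpEntry {
  std::vector<int64_t> output_ids;
  std::vector<int64_t> input_ids;
};

class MiniTape {
 public:
  // Watched tensors are marked with op id -1 and are never deleted.
  void Watch(int64_t tensor_id) { tensor_tape_[tensor_id] = -1; }

  void RecordOperation(const std::vector<int64_t>& outputs,
                       const std::vector<int64_t>& inputs) {
    const int64_t op_id = next_op_id_++;
    for (int64_t i : inputs) tensor_usage_[i]++;  // inputs gain a reference
    for (int64_t o : outputs) {
      tensor_tape_[o] = op_id;  // o was produced by op_id
      tensor_usage_[o] = 1;     // the tape entry itself holds one reference
    }
    op_tape_[op_id] = OpEntry{outputs, inputs};
  }

  // Drops one reference to tensor_id; when the producing op has no live
  // outputs left, its entry is erased and deletion recurses into its inputs.
  void DeleteTrace(int64_t tensor_id) {
    auto it = tensor_usage_.find(tensor_id);
    if (it == tensor_usage_.end() || --it->second != 0) return;
    tensor_usage_.erase(it);
    auto t = tensor_tape_.find(tensor_id);
    if (t == tensor_tape_.end()) return;
    const int64_t op_id = t->second;
    if (op_id == -1) return;  // watched tensor: keep it
    tensor_tape_.erase(t);
    auto op_it = op_tape_.find(op_id);
    assert(op_it != op_tape_.end());
    for (int64_t o : op_it->second.output_ids)
      if (tensor_usage_.count(o) != 0) return;  // another output still in use
    for (int64_t i : op_it->second.input_ids) DeleteTrace(i);
    op_tape_.erase(op_it);
  }

  size_t num_ops() const { return op_tape_.size(); }

 private:
  std::unordered_map<int64_t, int64_t> tensor_tape_;   // tensor id -> op id
  std::unordered_map<int64_t, OpEntry> op_tape_;       // op id -> entry
  std::unordered_map<int64_t, int64_t> tensor_usage_;  // remaining usages
  int64_t next_op_id_ = 0;
};

int main() {
  MiniTape tape;
  tape.Watch(1);
  tape.RecordOperation({2}, {1});       // op 0: t1 -> t2
  tape.RecordOperation({3}, {2});       // op 1: t2 -> t3
  tape.DeleteTrace(3);                  // frees op 1; t2 is still referenced
  std::cout << tape.num_ops() << "\n";  // prints 1
  tape.DeleteTrace(2);                  // now op 0 goes too (t1 stays watched)
  std::cout << tape.num_ops() << "\n";  // prints 0
}

The recursion into an op's inputs is what lets the eager tape garbage-collect whole chains of unreferenced ops as soon as the last downstream tensor dies.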
diff --git a/tensorflow/c/eager/tape.h b/tensorflow/c/eager/tape.h
index 29d73c5ca4..df51f300eb 100644
--- a/tensorflow/c/eager/tape.h
+++ b/tensorflow/c/eager/tape.h
@@ -19,7 +19,6 @@ limitations under the License.
// maintains the data structures required to do so.
#include <unordered_map>
-#include <unordered_set>
#include <vector>
#include "tensorflow/core/framework/tensor_shape.h"
#include "tensorflow/core/framework/types.h"
@@ -37,14 +36,13 @@ struct TapeTensor {
};
// Represents an entry in the tape.
-template <typename BackwardFunction>
struct OpTapeEntry {
  string op_type;
  std::vector<TapeTensor> output_tensor_info;
  std::vector<int64> input_tensor_id;
  // TODO(apassos) consider narrowing down this interface.
-  BackwardFunction* backward_function;
+  void* backward_function;
  // Should be called before deleting the backward function. TODO(apassos) use
  // unique_ptrs to ensure this happens.
@@ -57,68 +55,13 @@ struct OpTapeEntry {
using TensorTape = std::unordered_map<int64, int64>;
// Map from operation-id to tape entry.
-template <typename BackwardFunction>
-using OpTape = std::unordered_map<int64, OpTapeEntry<BackwardFunction>>;
-
-// Operations the tape needs to perform on tensors to do backpropagation. Named
-// "vspace" because a subset of these are related to a vector space, such as
-// adding gradients, getting zeroes, etc. Currently cannot be implemented
-// without using tensorflow python code, hence left unspecified here.
-//
-// Gradient is the type returned by gradient functions. In Python TF it's either
-// Tensor or IndexedSlices or None, which here we map to nullptr. Gradients need
-// to allow their size to be computed and they need to be passable to a backward
-// function and deleted (as the backprop code creates lots of gradients the user
-// is not interested in).
-//
-// BackwardFunction needs to be a closure which stores intermediate activations
-// from the forward computation and calls a vector-jacobian product function
-// (also known as adjoint function) to compute, given downstream gradients,
-// upstream gradients.
-//
-// TODO(apassos) provide concrete template instantiations for TFE_TensorHandle
-// specialization, which is blocked by quite a few things needing to loop back
-// into python now.
-template <typename Gradient, typename BackwardFunction>
-class VSpace {
- public:
-  virtual ~VSpace() {}
-
-  // Returns the number of elements in the gradient tensor.
-  virtual int64 NumElements(Gradient* tensor) const = 0;
-
-  // Consumes references to the tensors in the gradient_tensors list and returns
-  // a tensor with the result.
-  virtual Gradient* AggregateGradients(
-      gtl::ArraySlice<Gradient*> gradient_tensors) const = 0;
-
-  // Returns a tensor of the right shape and dtype filled with zeros.
-  virtual Gradient* Zeros(TensorShape shape, DataType dtype) const = 0;
-
-  // Returns a Tensor which is filled with ones and like the input.
-  virtual Gradient* Ones(TensorShape shape, DataType dtype) const = 0;
-
-  // Calls the passed-in backward function.
-  virtual Status CallBackwardFunction(
-      BackwardFunction* backward_function,
-      gtl::ArraySlice<Gradient*> output_gradients,
-      std::vector<Gradient*>* result) const = 0;
-
-  // Deletes the input tensor.
-  virtual void DeleteGradient(Gradient* gradient) const = 0;
-};
+using OpTape = std::unordered_map<int64, OpTapeEntry>;
// Traces the execution of operations, doing eager garbage collection, and
// exporting a full trace so other code can do backpropagation. Not thread-safe.
-template <typename Gradient, typename BackwardFunction>
class GradientTape {
 public:
  GradientTape() {}
-  ~GradientTape() {
-    for (const auto& pair : op_tape_) {
-      pair.second.backward_function_deleter();
-    }
-  }
  bool ShouldRecord(gtl::ArraySlice<int64> tensor_ids);
@@ -127,24 +70,19 @@ class GradientTape {
  void RecordOperation(const string& op_type,
                       gtl::ArraySlice<TapeTensor> output_tensors,
                       gtl::ArraySlice<int64> input_tensor_id,
-                       BackwardFunction* backward_function,
+                       void* backward_function,
                       const std::function<void()>& backward_function_deleter);
  void DeleteTrace(int64 tensor_id);
-  // Consumes the internal state of the tape (so cannot be called more than
-  // once) and produces the gradient of the target tensors with respect to the
-  // source tensors. The output gradients are used if not empty and not
-  // null. The result is populated with one tensor per target element.
-  Status ComputeGradient(const VSpace<Gradient, BackwardFunction>& vspace,
-                         gtl::ArraySlice<int64> target_tensor_ids,
-                         gtl::ArraySlice<int64> source_tensor_id,
-                         gtl::ArraySlice<Gradient*> output_gradients,
-                         std::vector<Gradient*>* result);
+  // Note: it is only valid to call Export once per tape, and after calling
+  // export the tape is no longer valid (i.e. calls to ShouldRecord, Watch,
+  // Record, and Delete have undefined behavior).
+  std::pair<TensorTape, OpTape> Export();
 private:
  TensorTape tensor_tape_;
-  OpTape<BackwardFunction> op_tape_;
+  OpTape op_tape_;
  int64 next_op_id_{0};
  // Map from tensor id to number of remaining usages (i.e. how many entries in
@@ -152,429 +90,6 @@ class GradientTape {
  std::unordered_map<int64, int64> tensor_usage_;
};
-// Template instantiations here
-
-template <typename Gradient, typename BackwardFunction>
-bool GradientTape<Gradient, BackwardFunction>::ShouldRecord(
-    gtl::ArraySlice<int64> tensor_ids) {
-  for (int64 i : tensor_ids) {
-    if (tensor_tape_.find(i) != tensor_tape_.end()) {
-      return true;
-    }
-  }
-  return false;
-}
-
-template <typename Gradient, typename BackwardFunction>
-void GradientTape<Gradient, BackwardFunction>::Watch(int64 tensor_id) {
-  tensor_tape_.emplace(tensor_id, -1);
-}
-
-template <typename Gradient, typename BackwardFunction>
-void GradientTape<Gradient, BackwardFunction>::RecordOperation(
-    const string& op_type, gtl::ArraySlice<TapeTensor> output_tensors,
-    gtl::ArraySlice<int64> input_tensor_id, BackwardFunction* backward_function,
-    const std::function<void()>& backward_function_deleter) {
-  if (!ShouldRecord(input_tensor_id)) {
-    backward_function_deleter();
-    return;
-  }
-  std::vector<int64> ids;
-  ids.reserve(input_tensor_id.size());
-  for (int64 i : input_tensor_id) {
-    tensor_usage_[i]++;
-    ids.push_back(i);
-  }
-  const int64 op_id = next_op_id_++;
-  std::vector<TapeTensor> tensors;
-  tensors.reserve(output_tensors.size());
-  for (const TapeTensor& o : output_tensors) {
-    // Note: the tensor can have already been watched and hence be in the tape,
-    // so we cannot check that we're inserting it here.
-    tensor_tape_[o.id] = op_id;
-    tensor_usage_[o.id] = 1;
-    tensors.push_back(o);
-  }
-  op_tape_[op_id] = OpTapeEntry<BackwardFunction>{
-      op_type, tensors, ids, backward_function, backward_function_deleter};
-}
-
-template <typename Gradient, typename BackwardFunction>
-void GradientTape<Gradient, BackwardFunction>::DeleteTrace(int64 tensor_id) {
-  auto it = tensor_usage_.find(tensor_id);
-  if (it == tensor_usage_.end()) {
-    return;
-  }
-  it->second--;
-  if (it->second != 0) {
-    return;
-  }
-  tensor_usage_.erase(it);
-  auto tensor_op_it = tensor_tape_.find(tensor_id);
-  if (tensor_op_it == tensor_tape_.end()) {
-    return;
-  }
-  const int64 op_id = tensor_op_it->second;
-  if (op_id == -1) {
-    // Do not delete watched tensors.
-    return;
-  }
-  tensor_tape_.erase(tensor_op_it);
-  auto op_it = op_tape_.find(op_id);
-  CHECK(op_it != op_tape_.end());
-  for (const auto& output : op_it->second.output_tensor_info) {
-    if (tensor_usage_.find(output.id) != tensor_usage_.end()) {
-      // Found a usage for an output, so cannot delete the op.
-      return;
-    }
-  }
-  for (int64 id : op_it->second.input_tensor_id) {
-    DeleteTrace(id);
-  }
-  op_it->second.backward_function_deleter();
-  op_tape_.erase(op_it);
-}
-
-// Terminology:
-//
-// - op: a possibly composite operation, which has an entry in the tape
-// - target: dy in dy/dx, i.e. the tensor being differentiated
-// - source: dx in dy/dx, i.e. the tensor we differentiate with respect to
-// - tensor: one of the many inputs or outputs of an operation
-//
-// Below here we do the gradient algorithm. It works as follows:
-//
-// First we filter the tape down to just the subset of operations we want to
-// differentiate. In the process of doing so we count how many times each
-// Tensor is used as an input to an op (so we know when we're done computing
-// gradients for that Tensor). We also count, for each tape entry, how many of
-// its output Tensors need gradients to be computed (Tensors which are not
-// used do not need any gradients to be computed).
-//
-// Next we start a backprop stack with the set of tape entries for which all
-// output gradients are already available. This is usually a subset of the set
-// of targets (not all of them, since a target which is itself consumed by
-// another op on the tape will not have its gradient available initially).
-//
-// Then we repeatedly pop an entry from the stack, run its backprop, and
-// update the gradients of its inputs. Once we have computed all gradients for
-// a single input we can mark this input as done, and this can trigger pushing
-// the op that produced it onto the stack once all of that op's outputs are
-// done.
-//
-// When the stack is empty we have gradients for all tensors we're interested
-// in.
-
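To make the scheme concrete, here is a minimal, self-contained C++ sketch of the same stack-based backprop for y = x*x + x at x = 3 (expected dy/dx = 2x + 1 = 7). All names here (ToyOp, producer, missing) are illustrative rather than the real tape API, and the per-tensor usage counting done by the real implementation is omitted:

#include <cstddef>
#include <cstdint>
#include <functional>
#include <iostream>
#include <unordered_map>
#include <vector>

// Toy tape entry: input/output tensor ids plus a backward function that maps
// the gradient of the (single) output to gradients of the inputs.
struct ToyOp {
  std::vector<int64_t> inputs;
  std::vector<int64_t> outputs;
  std::function<std::vector<double>(double)> backward;
};

int main() {
  const double x = 3.0;
  // Trace for y = x*x + x. Tensor ids: 0 = x, 1 = x*x, 2 = y.
  std::unordered_map<int64_t, ToyOp> op_tape;
  op_tape[0] = ToyOp{
      {0}, {1}, [x](double g) { return std::vector<double>{2 * x * g}; }};
  op_tape[1] = ToyOp{
      {1, 0}, {2}, [](double g) { return std::vector<double>{g, g}; }};
  std::unordered_map<int64_t, int64_t> producer = {{1, 0}, {2, 1}};
  std::unordered_map<int64_t, double> grads = {{2, 1.0}};  // seed dy/dy = 1
  // Per op, how many output gradients are still missing. Only op 1 is ready
  // initially (its sole output is the target), so it seeds the stack.
  std::unordered_map<int64_t, int64_t> missing = {{0, 1}, {1, 0}};
  std::vector<int64_t> stack = {1};

  while (!stack.empty()) {
    const ToyOp op = op_tape[stack.back()];
    stack.pop_back();
    const std::vector<double> in_grads = op.backward(grads[op.outputs[0]]);
    for (std::size_t i = 0; i < op.inputs.size(); ++i) {
      grads[op.inputs[i]] += in_grads[i];  // accumulate across all uses
      auto it = producer.find(op.inputs[i]);
      // Once the producing op has gradients for all of its outputs, push it.
      if (it != producer.end() && --missing[it->second] == 0) {
        stack.push_back(it->second);
      }
    }
  }
  std::cout << "dy/dx = " << grads[0] << "\n";  // prints dy/dx = 7
}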
-namespace {
-
-template <typename BackwardFunction>
-struct BackpropInitialState {
- OpTape<BackwardFunction> op_tape;
-
- // Map from tensor ID to how many references still exist for this tensor in
- // the tape.
- std::unordered_map<int64, int64> tensor_usage_counts;
-
- // Maps from op ID to how many output tensors of this op still need to have
- // their gradients computed.
- std::unordered_map<int64, int64> op_missing_tensor;
-};
-
-template <typename BackwardFunction>
-BackpropInitialState<BackwardFunction> PrepareBackprop(
- gtl::ArraySlice<int64> target, const TensorTape& tensor_tape,
- OpTape<BackwardFunction> op_tape,
- const std::unordered_set<int64>& sources_set) {
- std::vector<int64> tensor_stack;
- tensor_stack.reserve(target.size());
- for (auto t : target) {
- tensor_stack.push_back(t);
- }
- BackpropInitialState<BackwardFunction> result;
- while (!tensor_stack.empty()) {
- int64 tensor_id = tensor_stack.back();
- tensor_stack.pop_back();
- auto op_id_it = tensor_tape.find(tensor_id);
- if (op_id_it == tensor_tape.end()) {
- continue;
- }
- int64 op_id = op_id_it->second;
- auto op_it = op_tape.find(op_id);
- auto result_op_it = result.op_tape.find(op_id);
- if (op_id == -1 || op_it == op_tape.end() ||
- result_op_it != result.op_tape.end()) {
- continue;
- }
- CHECK(result.op_tape.emplace(op_id, op_it->second).second);
- for (auto it : op_it->second.input_tensor_id) {
- auto count_it = result.tensor_usage_counts.find(it);
- if (count_it != result.tensor_usage_counts.end()) {
- count_it->second++;
- } else {
- result.tensor_usage_counts[it] = 1;
- if (sources_set.find(it) == sources_set.end() &&
- tensor_tape.find(it) != tensor_tape.end()) {
- tensor_stack.push_back(it);
- }
- }
- }
- op_tape.erase(op_it);
- }
- for (auto& pair : result.tensor_usage_counts) {
- auto it = tensor_tape.find(pair.first);
- if (it != tensor_tape.end() && it->second != -1) {
- result.op_missing_tensor[it->second] += 1;
- }
- }
- // Call destructors for all unneeded gradient functions.
- for (const auto& op_pair : op_tape) {
- op_pair.second.backward_function_deleter();
- }
- return result;
-}
-
-template <typename BackwardFunction>
-std::vector<int64> InitialStack(
- const OpTape<BackwardFunction>& op_tape,
- const std::unordered_map<int64, int64>& op_missing_tensor) {
- std::vector<int64> result;
- for (auto& op_entry : op_tape) {
- if (op_missing_tensor.find(op_entry.first) == op_missing_tensor.end()) {
- result.push_back(op_entry.first);
- }
- }
- return result;
-}
-
-template <typename Gradient, typename BackwardFunction>
-Status InitialGradients(
- const VSpace<Gradient, BackwardFunction>& vspace,
- gtl::ArraySlice<int64> target_tensor_ids,
- gtl::ArraySlice<Gradient*> output_gradients, const TensorTape& tensor_tape,
- const OpTape<BackwardFunction>& op_tape,
- const std::unordered_map<int64, int64>& tensor_usage_counts,
- std::unordered_map<int64, std::vector<Gradient*>>* result) {
- for (int i = 0; i < target_tensor_ids.size(); ++i) {
- const int64 id = target_tensor_ids[i];
- if (tensor_usage_counts.find(id) != tensor_usage_counts.end()) {
- if (!output_gradients.empty() && output_gradients[i] != nullptr) {
- // TODO(apassos) figure out how to print debugging information here.
- return errors::InvalidArgument(
- "A gradient was provided for a tensor which is used as part of the "
- "computation.");
- }
- } else {
- if (output_gradients.empty() || output_gradients[i] == nullptr) {
- auto tensor_it = tensor_tape.find(id);
- if (tensor_it != tensor_tape.end() && tensor_it->second != -1) {
- auto op_it = op_tape.find(tensor_it->second);
- if (op_it == op_tape.end()) {
- return errors::Internal(
- "Internal state of the gradient tape is invalid.");
- }
- bool found = false;
- for (int j = 0; j < op_it->second.output_tensor_info.size(); ++j) {
- if (op_it->second.output_tensor_info[j].id == id) {
- found = true;
- (*result)[id].push_back(
- vspace.Ones(op_it->second.output_tensor_info[j].shape,
- op_it->second.output_tensor_info[j].dtype));
- break;
- }
- }
- if (!found) {
- return errors::Internal(
- "Internal state of the gradient tape is invalid.");
- }
- } else {
- // No record of the target tensor found on the tape, so no gradient
- // needs to be computed from it. Do nothing.
- }
- } else {
- (*result)[id].push_back(output_gradients[i]);
- }
- }
- }
- return Status::OK();
-}
-
-} // namespace
-
-// If over kMinAggregateCount gradients are accumulated and their total memory
-// consumption is over kMinAggregateBytes, do an early aggregation so as to
-// release the gradient tensors and save memory.
-constexpr int kMinAggregateCount = 4;
-constexpr int kMinAggregateBytes = 128 * 1024 * 1024;
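As an illustrative calculation (the `* 4` in the check below assumes 4-byte elements, e.g. float32): once a fifth unaggregated gradient of 8M elements arrives, the buffered total is 5 × 8,388,608 × 4 B ≈ 160 MB, which clears both the count threshold (> 4) and the 128 MiB byte threshold, so the five tensors are summed into one and the originals can be released.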
-
-template <typename Gradient, typename BackwardFunction>
-Status GradientTape<Gradient, BackwardFunction>::ComputeGradient(
- const VSpace<Gradient, BackwardFunction>& vspace,
- gtl::ArraySlice<int64> target_tensor_ids,
- gtl::ArraySlice<int64> source_tensor_ids,
- gtl::ArraySlice<Gradient*> output_gradients,
- std::vector<Gradient*>* result) {
- std::unordered_set<int64> sources_set(source_tensor_ids.begin(),
- source_tensor_ids.end());
- BackpropInitialState<BackwardFunction> state = PrepareBackprop(
- target_tensor_ids, tensor_tape_, std::move(op_tape_), sources_set);
- std::vector<int64> op_stack =
- InitialStack(state.op_tape, state.op_missing_tensor);
- std::unordered_map<int64, std::vector<Gradient*>> gradients;
- Status s = InitialGradients(vspace, target_tensor_ids, output_gradients,
- tensor_tape_, state.op_tape,
- state.tensor_usage_counts, &gradients);
- auto cleanup = [&state]() {
- // Release all backprop functions
- for (const auto& pair : state.op_tape) {
- pair.second.backward_function_deleter();
- }
- };
- if (!s.ok()) {
- cleanup();
- return s;
- }
- std::unordered_map<int64, int64> gradients_size;
- // TODO(apassos) multiple threads could be dequeuing from op_stack at the same
- // time, for better CPU backprop performance.
- VLOG(1) << "Initial stack:";
- if (VLOG_IS_ON(1)) {
- for (auto t : op_stack) {
- VLOG(1) << " " << t;
- }
- }
- std::unordered_map<string, std::unordered_set<int>>
- functions_accept_none_for_indices({
- {"SoftmaxCrossEntropyWithLogits", {1}},
- {"FusedBatchNorm", {1, 2, 3, 4}},
- });
- while (!op_stack.empty()) {
- const int64 op = op_stack.back();
- VLOG(1) << "Popped " << op;
- op_stack.pop_back();
- auto op_it = state.op_tape.find(op);
- if (op_it == state.op_tape.end()) {
- // It is possible for ops to end up on the stack if they are unrelated to
- // the target; we should just skip them.
- continue;
- }
- auto trace = std::move(op_it->second);
- state.op_tape.erase(op_it);
- std::vector<Gradient*> out_gradients;
- out_gradients.reserve(trace.output_tensor_info.size());
- for (int i = 0; i < trace.output_tensor_info.size(); ++i) {
- const int64 id = trace.output_tensor_info[i].id;
- auto grad_it = gradients.find(id);
- if (grad_it == gradients.end()) {
- auto func_name_it =
- functions_accept_none_for_indices.find(trace.op_type);
- if (func_name_it != functions_accept_none_for_indices.end() &&
- func_name_it->second.find(i) != func_name_it->second.end()) {
- out_gradients.push_back(nullptr);
- } else {
- out_gradients.push_back(
- vspace.Zeros(trace.output_tensor_info[i].shape,
- trace.output_tensor_info[i].dtype));
- }
- } else {
- out_gradients.push_back(vspace.AggregateGradients(grad_it->second));
- if (sources_set.find(grad_it->first) == sources_set.end()) {
- gradients.erase(grad_it);
- }
- }
- }
- std::vector<Gradient*> in_gradients;
- Status s = vspace.CallBackwardFunction(trace.backward_function,
- out_gradients, &in_gradients);
- if (!s.ok()) {
- VLOG(1) << "Gradient function failed.";
- cleanup();
- return s;
- }
- VLOG(1) << "Got " << in_gradients.size() << " in_gradients for "
- << trace.input_tensor_id.size() << " sources";
- for (int i = 0; i < in_gradients.size(); ++i) {
- const int64 id = trace.input_tensor_id[i];
- if (in_gradients[i] != nullptr) {
- auto& unaggregated_grads = gradients[id];
- unaggregated_grads.push_back(in_gradients[i]);
- if (unaggregated_grads.size() > kMinAggregateCount) {
- auto size_it = gradients_size.find(id);
- int64 size;
- if (size_it == gradients_size.end()) {
- size = vspace.NumElements(unaggregated_grads[0]);
- gradients_size.emplace(id, size);
- } else {
- size = size_it->second;
- }
- if (unaggregated_grads.size() * size * 4 > kMinAggregateBytes) {
- Gradient* grad = vspace.AggregateGradients(unaggregated_grads);
- unaggregated_grads.clear();
- unaggregated_grads.push_back(grad);
- }
- }
- }
- auto usage_count_it = state.tensor_usage_counts.find(id);
- if (usage_count_it == state.tensor_usage_counts.end()) {
- VLOG(1) << "Tensor " << id << " not used";
- continue;
- }
- usage_count_it->second--;
- if (usage_count_it->second > 0) {
- VLOG(1) << "Tensor " << id << " usage count " << usage_count_it->second;
- continue;
- }
- auto tape_it = tensor_tape_.find(id);
- if (tape_it == tensor_tape_.end()) {
- VLOG(1) << "Tensor " << id
- << " has no associated op. Deleting gradient";
- auto grad_it = gradients.find(id);
- if (grad_it != gradients.end()) {
- for (auto g : grad_it->second) {
- vspace.DeleteGradient(g);
- }
- gradients.erase(grad_it);
- }
- continue;
- }
- const int64 op_id = tape_it->second;
- if (op_id == -1) {
- VLOG(1) << "Tensor " << id << " is source";
- continue;
- }
- auto missing_it = state.op_missing_tensor.find(op_id);
- if (missing_it != state.op_missing_tensor.end()) {
- missing_it->second--;
- VLOG(1) << "Op " << op_id << " missing " << missing_it->second
- << " output gradients";
- if (missing_it->second == 0) {
- op_stack.push_back(op_id);
- }
- }
- }
- }
- CHECK(state.op_tape.empty());
- result->reserve(source_tensor_ids.size());
- for (auto is : source_tensor_ids) {
- auto grad_it = gradients.find(is);
- if (grad_it == gradients.end()) {
- result->push_back(nullptr);
- } else {
- if (grad_it->second.size() == 1) {
- result->push_back(grad_it->second[0]);
- } else {
- result->push_back(vspace.AggregateGradients(grad_it->second));
- }
- gradients.erase(grad_it);
- }
- }
- VLOG(1) << "Final gradients size: " << gradients.size();
- for (auto grad_pair : gradients) {
- for (const auto& g : grad_pair.second) {
- vspace.DeleteGradient(g);
- }
- }
- return Status::OK();
-}
-
} // namespace eager
} // namespace tensorflow
diff --git a/tensorflow/compiler/aot/tfcompile.bzl b/tensorflow/compiler/aot/tfcompile.bzl
index ee291c12d0..1e22b760b8 100644
--- a/tensorflow/compiler/aot/tfcompile.bzl
+++ b/tensorflow/compiler/aot/tfcompile.bzl
@@ -119,7 +119,7 @@ def tf_library(name, graph, config,
out_nodes_file,
] + freeze_saver_srcs,
outs=[freeze_file],
- cmd=("$(location //tensorflow/python/tools:freeze_graph)" +
+ cmd=("$(location @org_tensorflow//tensorflow/python/tools:freeze_graph)" +
freeze_args),
tools=["@org_tensorflow//tensorflow/python/tools:freeze_graph"],
tags=tags,
@@ -130,6 +130,10 @@ def tf_library(name, graph, config,
header_file = name + ".h"
object_file = name + ".o"
ep = ("__" + PACKAGE_NAME + "__" + name).replace("/", "_")
+ if type(tfcompile_flags) == type(""):
+ flags = tfcompile_flags
+ else:
+ flags = " ".join(["'" + arg.replace("'", "'\\''") + "'" for arg in (tfcompile_flags or [])])
native.genrule(
name=("gen_" + name),
srcs=[
@@ -148,7 +152,7 @@ def tf_library(name, graph, config,
" --target_triple=" + target_llvm_triple() +
" --out_header=$(@D)/" + header_file +
" --out_object=$(@D)/" + object_file +
- " " + (tfcompile_flags or "")),
+ flags),
tools=[tfcompile_tool],
visibility=visibility,
testonly=testonly,
@@ -185,7 +189,7 @@ def tf_library(name, graph, config,
" --cpp_class=" + cpp_class +
" --target_triple=" + target_llvm_triple() +
" --out_session_module=$(@D)/" + session_module_pb +
- " " + (tfcompile_flags or "")),
+ flags),
tools=[tfcompile_tool],
visibility=visibility,
testonly=testonly,
@@ -195,8 +199,7 @@ def tf_library(name, graph, config,
# The cc_library rule packaging up the header and object file, and needed
# kernel implementations.
- need_xla_data_proto = (tfcompile_flags and
- tfcompile_flags.find("--gen_program_shape") != -1)
+ need_xla_data_proto = (flags and flags.find("--gen_program_shape") != -1)
native.cc_library(
name=name,
srcs=[object_file],
@@ -253,7 +256,7 @@ def tf_library(name, graph, config,
],
outs=[test_file],
cmd=("sed " + sed_replace +
- " $(location //tensorflow/compiler/aot:test.cc) " +
+ " $(location @org_tensorflow//tensorflow/compiler/aot:test.cc) " +
"> $(OUTS)"),
tags=tags,
)
diff --git a/tensorflow/compiler/jit/kernels/xla_launch_op.cc b/tensorflow/compiler/jit/kernels/xla_launch_op.cc
index e481796d9e..27c5da08c1 100644
--- a/tensorflow/compiler/jit/kernels/xla_launch_op.cc
+++ b/tensorflow/compiler/jit/kernels/xla_launch_op.cc
@@ -257,6 +257,7 @@ void XlaLocalLaunchOp::Compute(OpKernelContext* ctx) {
options.flib_def = ctx->function_library()->GetFunctionLibraryDefinition();
options.graph_def_version = ctx->function_library()->graph_def_version();
options.allow_cpu_custom_calls = (platform_id_ == gpu::host::kHostPlatformId);
+ options.local_executable_has_hybrid_result = true;
const XlaCompiler::CompilationResult* kernel;
xla::LocalExecutable* executable;
diff --git a/tensorflow/compiler/jit/xla_compilation_cache.cc b/tensorflow/compiler/jit/xla_compilation_cache.cc
index bc2eccd277..23368b6c76 100644
--- a/tensorflow/compiler/jit/xla_compilation_cache.cc
+++ b/tensorflow/compiler/jit/xla_compilation_cache.cc
@@ -227,7 +227,10 @@ Status XlaCompilationCache::BuildExecutable(
}
xla::ExecutableBuildOptions build_options;
build_options.set_device_ordinal(client_->default_device_ordinal());
+ build_options.set_platform(client_->platform());
build_options.set_result_layout(result.xla_output_shape);
+ build_options.set_has_hybrid_result(
+ options.local_executable_has_hybrid_result);
auto compile_result =
client_->Compile(*result.computation, argument_layouts, build_options);
diff --git a/tensorflow/compiler/tests/BUILD b/tensorflow/compiler/tests/BUILD
index 21b8823944..284ecbf97d 100644
--- a/tensorflow/compiler/tests/BUILD
+++ b/tensorflow/compiler/tests/BUILD
@@ -657,7 +657,7 @@ tf_library(
cpp_class = "LSTMLayerInference",
graph = "lstm_layer_inference.pbtxt",
tags = ["manual"],
- tfcompile_flags = "--xla_cpu_multi_thread_eigen=false",
+ tfcompile_flags = ["--xla_cpu_multi_thread_eigen=false"],
)
# -----------------------------------------------------------------------------
diff --git a/tensorflow/compiler/tests/fused_batchnorm_test.py b/tensorflow/compiler/tests/fused_batchnorm_test.py
index 936fcf8b6b..a773b5a947 100644
--- a/tensorflow/compiler/tests/fused_batchnorm_test.py
+++ b/tensorflow/compiler/tests/fused_batchnorm_test.py
@@ -36,7 +36,7 @@ class FusedBatchNormTest(XLATestCase):
x_square = x * x
x_square_sum = np.sum(x_square, (0, 1, 2))
x_sum = np.sum(x, axis=(0, 1, 2))
- element_count = np.size(x) / int(np.shape(x)[0])
+ element_count = np.size(x) / int(np.shape(x)[-1])
mean = x_sum / element_count
var = x_square_sum / element_count - mean * mean
normalized = (x - mean) / np.sqrt(var + epsilon)
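This fix matters because per-channel statistics in NHWC must average over the N·H·W elements feeding each channel. With the old test shape [2, 2, 6, 2], dividing np.size(x) = 48 by shape[0] = 2 happens to give the same 24 as dividing by shape[-1] = 2, which masked the bug; the tests below therefore also raise the channel count to 3, where the two disagree (72/2 = 36 vs. the correct 72/3 = 24 = 2·2·6).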
@@ -64,8 +64,9 @@ class FusedBatchNormTest(XLATestCase):
return grad_x, grad_scale, grad_offset
def testInference(self):
- x_shape = [2, 2, 6, 2]
- scale_shape = [2]
+ channel = 3
+ x_shape = [2, 2, 6, channel]
+ scale_shape = [channel]
x_val = np.random.random_sample(x_shape).astype(np.float32)
scale_val = np.random.random_sample(scale_shape).astype(np.float32)
@@ -74,8 +75,8 @@ class FusedBatchNormTest(XLATestCase):
with self.test_session() as sess, self.test_scope():
# To avoid constant folding
t_val = array_ops.placeholder(np.float32, shape=x_shape, name="x")
- scale = array_ops.placeholder(np.float32, shape=[2], name="scale")
- offset = array_ops.placeholder(np.float32, shape=[2], name="offset")
+ scale = array_ops.placeholder(np.float32, shape=scale_shape, name="scale")
+ offset = array_ops.placeholder(np.float32, shape=scale_shape, name="offset")
epsilon = 0.001
y_ref, mean_ref, var_ref = self._reference_training(
x_val, scale_val, offset_val, epsilon, data_format)
@@ -97,8 +98,9 @@ class FusedBatchNormTest(XLATestCase):
self.assertAllClose(y_val, y_ref, atol=1e-3)
def _testLearning(self, use_gradient_checker):
- x_shape = [2, 2, 6, 2]
- scale_shape = [2]
+ channel = 3
+ x_shape = [2, 2, 6, channel]
+ scale_shape = [channel]
x_val = np.random.random_sample(x_shape).astype(np.float32)
scale_val = np.random.random_sample(scale_shape).astype(np.float32)
@@ -109,8 +111,8 @@ class FusedBatchNormTest(XLATestCase):
with self.test_session() as sess, self.test_scope():
# To avoid constant folding
t_val = array_ops.placeholder(np.float32, shape=x_shape, name="x")
- scale = array_ops.placeholder(np.float32, shape=[2], name="scale")
- offset = array_ops.placeholder(np.float32, shape=[2], name="offset")
+ scale = array_ops.placeholder(np.float32, shape=scale_shape, name="scale")
+ offset = array_ops.placeholder(np.float32, shape=scale_shape, name="offset")
epsilon = 0.001
y, mean, var = nn.fused_batch_norm(
t_val,
@@ -154,8 +156,9 @@ class FusedBatchNormTest(XLATestCase):
def testGradient(self):
# TODO(b/64270657): Use gradient_checker here in addition to comparing with
# this reference implementation.
- x_shape = [2, 2, 6, 2]
- scale_shape = [2]
+ channel = 3
+ x_shape = [2, 2, 6, channel]
+ scale_shape = [channel]
grad_val = np.random.random_sample(x_shape).astype(np.float32)
x_val = np.random.random_sample(x_shape).astype(np.float32)
scale_val = np.random.random_sample(scale_shape).astype(np.float32)
diff --git a/tensorflow/compiler/tf2xla/type_util.cc b/tensorflow/compiler/tf2xla/type_util.cc
index c969212a1b..1efbe0ffb1 100644
--- a/tensorflow/compiler/tf2xla/type_util.cc
+++ b/tensorflow/compiler/tf2xla/type_util.cc
@@ -49,9 +49,6 @@ Status DataTypeToPrimitiveType(DataType data_type, xla::PrimitiveType* type) {
case tensorflow::DT_UINT64:
*type = xla::U64;
return Status::OK();
- case tensorflow::DT_BFLOAT16:
- *type = xla::BF16;
- return Status::OK();
case tensorflow::DT_HALF:
*type = xla::F16;
return Status::OK();
diff --git a/tensorflow/compiler/tf2xla/xla_compiler.h b/tensorflow/compiler/tf2xla/xla_compiler.h
index ac7d4cfb12..4d40ca5825 100644
--- a/tensorflow/compiler/tf2xla/xla_compiler.h
+++ b/tensorflow/compiler/tf2xla/xla_compiler.h
@@ -236,6 +236,12 @@ class XlaCompiler {
// to the computation.
bool allow_cpu_custom_calls = false;
+ // If 'local_executable_has_hybrid_result' is true, the top-level pointers of
+ // the result tuple of compiled programs are stored in host memory and the
+ // nested buffers in device memory; otherwise the whole result tuple is
+ // stored in device memory.
+ bool local_executable_has_hybrid_result = false;
+
// If not nullptr, populate_resource_manager is called with the
// compilation device's resource manager when the compilation
// device is created, and can be used to create metadata objects
diff --git a/tensorflow/compiler/xla/BUILD b/tensorflow/compiler/xla/BUILD
index 515b572b0e..660f419e46 100644
--- a/tensorflow/compiler/xla/BUILD
+++ b/tensorflow/compiler/xla/BUILD
@@ -77,7 +77,6 @@ cc_library(
hdrs = ["types.h"],
visibility = [":friends"],
deps = [
- "//tensorflow/core:framework_lite",
"//tensorflow/core:lib",
"//third_party/eigen3",
],
@@ -340,7 +339,6 @@ cc_library(
name = "array",
hdrs = ["array.h"],
deps = [
- ":status",
":types",
"//tensorflow/core:lib",
],
diff --git a/tensorflow/compiler/xla/array.h b/tensorflow/compiler/xla/array.h
index 213e0bac6c..ba898d1f4e 100644
--- a/tensorflow/compiler/xla/array.h
+++ b/tensorflow/compiler/xla/array.h
@@ -23,10 +23,8 @@ limitations under the License.
#include <iterator>
#include <memory>
#include <random>
-#include <type_traits>
#include <vector>
-#include "tensorflow/compiler/xla/status.h"
#include "tensorflow/compiler/xla/types.h"
#include "tensorflow/core/lib/core/bits.h"
#include "tensorflow/core/lib/strings/str_util.h"
@@ -37,63 +35,10 @@ limitations under the License.
namespace xla {
-namespace array_impl {
-
-// conjunction
-//
-// Performs a compile-time logical AND operation on the passed types (which
-// must have `::value` members convertible to `bool`). Short-circuits if it
-// encounters any `false` members (and does not compare the `::value` members
-// of any remaining arguments).
-//
-// This metafunction is designed to be a drop-in replacement for the C++17
-// `std::conjunction` metafunction.
-template <typename... Ts>
-struct conjunction;
-
-template <typename T, typename... Ts>
-struct conjunction<T, Ts...>
- : std::conditional<T::value, conjunction<Ts...>, T>::type {};
-
-template <>
-struct conjunction<> : std::true_type {};
-
-// A type trait that is valid when all elements in a parameter pack are of
-// integral type.
-template <typename... T>
-using pack_is_integral = conjunction<std::is_integral<T>...>;
-
-// Compares three same-sized containers elementwise. Returns true iff, for
-// every i, values[i] lies inside the half-open range
-// [range_starts[i], range_ends[i]), and false otherwise.
-template <typename C1, typename C2, typename C3>
-bool all_inside_range(const C1& values, const C2& range_starts,
- const C3& range_ends) {
- for (size_t i = 0, e = values.size(); i < e; ++i) {
- if (values[i] < range_starts[i] || values[i] >= range_ends[i]) {
- return false;
- }
- }
- return true;
-}
-
-} // namespace array_impl
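As a quick illustration of the trait machinery removed above, here is a self-contained sketch (the `demo` namespace is mine, not XLA's) that behaves like `pack_is_integral`:

#include <type_traits>

namespace demo {
// Same shape as the removed array_impl::conjunction: inherit from the first
// false member, or from std::true_type when the pack is exhausted.
template <typename... Ts>
struct conjunction;
template <typename T, typename... Ts>
struct conjunction<T, Ts...>
    : std::conditional<T::value, conjunction<Ts...>, T>::type {};
template <>
struct conjunction<> : std::true_type {};

template <typename... T>
using pack_is_integral = conjunction<std::is_integral<T>...>;
}  // namespace demo

static_assert(demo::pack_is_integral<int, long, char>::value,
              "an all-integral pack passes");
static_assert(!demo::pack_is_integral<int, float>::value,
              "a single non-integral type fails the whole pack");

int main() { return 0; }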
-
// General N dimensional array class with arbitrary value type.
template <typename T>
class Array {
public:
- // Type inference can have a hard time parsing very deep initializer list
- // nests, especially if one or more dimensions is one as the compiler just
- // sees a single-element integer initializer. These typedefs allow casting
- // explicitly with less typing.
- using InitializerList1D = std::initializer_list<T>;
- using InitializerList2D = std::initializer_list<InitializerList1D>;
- using InitializerList3D = std::initializer_list<InitializerList2D>;
- using InitializerList4D = std::initializer_list<InitializerList3D>;
-
- using value_type = T;
-
// Creates a new array with the specified dimensions.
explicit Array(tensorflow::gtl::ArraySlice<int64> sizes)
: Array(sizes, T()) {}
@@ -108,7 +53,7 @@ class Array {
// Creates a 2D array from the given nested initializer list. The outer
// initializer list is the first dimension, the inner is the second dimension.
// For example, {{1, 2, 3}, {4, 5, 6}} results in an array with n1=2 and n2=3.
- Array(InitializerList2D values)
+ Array(std::initializer_list<std::initializer_list<T>> values)
: Array(ToInt64Vector({values.size(), values.begin()->size()})) {
int64 idx = 0;
for (const auto& it1 : values) {
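For the {{1, 2, 3}, {4, 5, 6}} example mentioned in the comment above, a self-contained sketch of how the sizes are deduced (illustrative only, not the xla::Array implementation):

#include <cstdint>
#include <initializer_list>
#include <iostream>
#include <vector>

int main() {
  std::initializer_list<std::initializer_list<int>> values = {{1, 2, 3},
                                                              {4, 5, 6}};
  const int64_t n1 = values.size();           // outer list: first dimension
  const int64_t n2 = values.begin()->size();  // inner list: second dimension
  std::vector<int> flat;  // row-major copy, mirroring the constructor body
  for (const auto& row : values) {
    for (int v : row) flat.push_back(v);
  }
  std::cout << "n1=" << n1 << " n2=" << n2 << " elements=" << flat.size()
            << "\n";  // n1=2 n2=3 elements=6
}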
@@ -122,7 +67,8 @@ class Array {
// Creates a 3D array from the given nested initializer list. The outer
// initializer list is the first dimension, and so on.
- Array(InitializerList3D values)
+ Array(std::initializer_list<std::initializer_list<std::initializer_list<T>>>
+ values)
: Array(ToInt64Vector({values.size(), values.begin()->size(),
values.begin()->begin()->size()})) {
int64 idx = 0;
@@ -139,7 +85,9 @@ class Array {
// Creates a 4D array from the given nested initializer list. The outer
// initializer list is the first dimension, and so on.
- Array(InitializerList4D values)
+ Array(std::initializer_list<
+ std::initializer_list<std::initializer_list<std::initializer_list<T>>>>
+ values)
: Array(ToInt64Vector({values.size(), values.begin()->size(),
values.begin()->begin()->size(),
values.begin()->begin()->begin()->size()})) {
@@ -225,46 +173,10 @@ class Array {
}
}
- // Invokes a callback with the (indices, value_ptr) for each cell in the
- // array. If the callback returns a non-OK status, returns that status
- // immediately; otherwise returns Status::OK().
- Status EachStatus(
- std::function<Status(tensorflow::gtl::ArraySlice<int64>, T*)> f) {
- std::vector<int64> index(sizes_.size());
- for (int64 i = 0; i < num_elements(); ++i, next_index(&index)) {
- Status s = f(index, &values_[i]);
- if (!s.ok()) {
- return s;
- }
- }
- return Status::OK();
- }
-
- // Invokes a callback with the (indices, value) for each cell in the array.
- // If the callback returns a non-OK status, returns that status immediately;
- // otherwise returns Status::OK().
- Status EachStatus(
- std::function<Status(tensorflow::gtl::ArraySlice<int64>, T)> f) const {
- std::vector<int64> index(sizes_.size());
- for (int64 i = 0; i < num_elements(); ++i, next_index(&index)) {
- Status s = f(index, values_[i]);
- if (!s.ok()) {
- return s;
- }
- }
- return Status::OK();
- }
-
// Returns the value at the cell specified by the indexes. The number of
// arguments has to match the number of dimensions of the array.
- //
- // The type trait is required to avoid this overload participating too
- // eagerly; a parameter pack can take zero or more elements, so we must
- // restrict this to only parameter packs that are all of integral type.
template <typename... Dims>
- typename std::enable_if<array_impl::pack_is_integral<Dims...>::value,
- const T&>::type
- operator()(Dims... dims) const {
+ const T& operator()(Dims... dims) const {
// We are using a std::array to avoid having to allocate memory in this
// function for performance reasons.
std::array<int64, sizeof...(dims)> indexes{{static_cast<int64>(dims)...}};
@@ -274,9 +186,7 @@ class Array {
// Returns the value at the cell specified by the indexes. The number of
// arguments has to match the number of dimensions of the array.
template <typename... Dims>
- typename std::enable_if<array_impl::pack_is_integral<Dims...>::value,
- T&>::type
- operator()(Dims... dims) {
+ T& operator()(Dims... dims) {
// We are using a std::array to avoid having to allocate memory in this
// function for performance reasons.
std::array<int64, sizeof...(dims)> indexes{{static_cast<int64>(dims)...}};
@@ -345,59 +255,6 @@ class Array {
bool operator!=(const Array<T>& other) const { return !(*this == other); }
- // Performs the equivalent of a slice operation on this array.
- Array<T> Slice(tensorflow::gtl::ArraySlice<int64> starts,
- tensorflow::gtl::ArraySlice<int64> limits) const {
- CHECK_EQ(starts.size(), num_dimensions());
- CHECK_EQ(limits.size(), num_dimensions());
-
- std::vector<int64> sizes;
- std::transform(starts.begin(), starts.end(), limits.begin(),
- std::back_inserter(sizes),
- [](int64 start, int64 limit) { return limit - start; });
- Array<T> result(sizes);
-
- std::vector<int64> index(sizes_.size());
- int64 slice_i = 0;
- for (int64 i = 0; i < num_elements(); ++i, next_index(&index)) {
- if (array_impl::all_inside_range(index, starts, limits)) {
- // Even though the bounds of result are different from our bounds, we're
- // iterating in the same order. So we can simply write successive linear
- // indices instead of recalculating a multi-dimensional index.
- result.values_[slice_i++] = values_[i];
- }
- }
- return result;
- }
-
- // Performs the equivalent of a DynamicUpdateSlice in-place on this array.
- void UpdateSlice(const Array<T>& from,
- tensorflow::gtl::ArraySlice<int64> start_indices) {
- CHECK_EQ(from.num_dimensions(), num_dimensions());
- std::vector<int64> limit_indices;
- std::transform(start_indices.begin(), start_indices.end(),
- from.dimensions().begin(), std::back_inserter(limit_indices),
- std::plus<int64>{});
- std::vector<int64> index(sizes_.size());
- int64 from_i = 0;
- for (int64 i = 0; i < num_elements(); ++i, next_index(&index)) {
- if (array_impl::all_inside_range(index, start_indices, limit_indices)) {
- // Even though the bounds of from are different from our bounds, we're
- // iterating in the same order. So we can simply write successive linear
- // indices instead of recalculating a multi-dimensional index.
- values_[i] = from.values_[from_i++];
- }
- }
- }
-
- // Performs an in-place reshape, modifying the dimensions but not the
- // underlying data.
- void Reshape(tensorflow::gtl::ArraySlice<int64> new_dimensions) {
- int64 old_num_elements = num_elements();
- sizes_ = std::vector<int64>(new_dimensions.begin(), new_dimensions.end());
- CHECK_EQ(num_elements(), old_num_elements);
- }
-
// Returns a string representation of the array suitable for debugging.
string ToString() const {
std::vector<string> pieces;
diff --git a/tensorflow/compiler/xla/array_test.cc b/tensorflow/compiler/xla/array_test.cc
index 8b94194774..093784f541 100644
--- a/tensorflow/compiler/xla/array_test.cc
+++ b/tensorflow/compiler/xla/array_test.cc
@@ -71,19 +71,6 @@ TEST(ArrayTest, IndexingReadWrite) {
EXPECT_EQ(arr(1, 2), 61);
}
-TEST(ArrayTest, DynamicIndexingReadWrite) {
- Array<int> arr({2, 3});
-
- std::vector<int64> index1 = {1, 1};
- std::vector<int64> index2 = {1, 2};
- EXPECT_EQ(arr(index1), 0);
- EXPECT_EQ(arr(index2), 0);
- arr(index1) = 51;
- arr(index2) = 61;
- EXPECT_EQ(arr(1, 1), 51);
- EXPECT_EQ(arr(1, 2), 61);
-}
-
TEST(ArrayTest, IndexingReadWriteBool) {
Array<bool> arr{{false, true, false}, {false, true, false}};
@@ -154,37 +141,5 @@ TEST(ArrayTest, Each) {
EXPECT_EQ(arr.num_elements() * (arr.num_elements() - 1) / 2, each_sum);
}
-TEST(ArrayTest, Slice) {
- Array<int64> arr({2, 4});
- arr.FillWithMultiples(1);
-
- Array<int64> identity_slice = arr.Slice({0, 0}, {2, 4});
- EXPECT_EQ(identity_slice.dimensions(), arr.dimensions());
- for (auto it1 = arr.begin(), it2 = identity_slice.begin(), e = arr.end();
- it1 != e; ++it1, ++it2) {
- EXPECT_EQ(*it1, *it2);
- }
-
- Array<int64> sub_slice = arr.Slice({1, 0}, {2, 2});
- EXPECT_EQ(sub_slice.dimensions(), (std::vector<int64>{1, 2}));
- const string expected = R"([[4, 5]])";
- EXPECT_EQ(expected, sub_slice.ToString());
-}
-
-TEST(ArrayTest, UpdateSlice) {
- Array<int64> arr({3, 4});
- arr.FillWithMultiples(1);
-
- Array<int64> sub_arr({2, 2});
- sub_arr.FillWithMultiples(3);
-
- arr.UpdateSlice(sub_arr, {1, 1});
-
- const string expected = R"([[0, 1, 2, 3],
- [4, 0, 3, 7],
- [8, 6, 9, 11]])";
- EXPECT_EQ(expected, arr.ToString());
-}
-
} // namespace
} // namespace xla
diff --git a/tensorflow/compiler/xla/client/client.cc b/tensorflow/compiler/xla/client/client.cc
index 66937d64af..92cd8e729d 100644
--- a/tensorflow/compiler/xla/client/client.cc
+++ b/tensorflow/compiler/xla/client/client.cc
@@ -142,7 +142,8 @@ StatusOr<std::unique_ptr<Literal>> Client::TransferFromOutfeed(
"TransferToClient request");
}
- return MakeUnique<Literal>(response.literal());
+ Literal literal(response.literal());
+ return MakeUnique<Literal>(literal);
}
Status Client::ResetDevice() {
diff --git a/tensorflow/compiler/xla/client/computation_builder.h b/tensorflow/compiler/xla/client/computation_builder.h
index 4c6e320557..8e1b4be1f3 100644
--- a/tensorflow/compiler/xla/client/computation_builder.h
+++ b/tensorflow/compiler/xla/client/computation_builder.h
@@ -68,7 +68,6 @@ class ShardingBuilder {
const TileAssignment& tile_assignment) {
OpSharding result;
result.set_type(OpSharding::Type::OpSharding_Type_OTHER);
- *result.mutable_tile_shape() = tile_shape;
for (int64 dim : tile_assignment.dimensions()) {
result.add_tile_assignment_dimensions(dim);
}
diff --git a/tensorflow/compiler/xla/client/lib/BUILD b/tensorflow/compiler/xla/client/lib/BUILD
index fca2bf2688..ee34682087 100644
--- a/tensorflow/compiler/xla/client/lib/BUILD
+++ b/tensorflow/compiler/xla/client/lib/BUILD
@@ -44,7 +44,6 @@ cc_library(
"//tensorflow/compiler/xla/client:computation",
"//tensorflow/compiler/xla/client:computation_builder",
"//tensorflow/compiler/xla/client:global_data",
- "//tensorflow/compiler/xla/tests:test_utils",
"//tensorflow/core:lib",
],
)
diff --git a/tensorflow/compiler/xla/client/lib/testing.cc b/tensorflow/compiler/xla/client/lib/testing.cc
index d936bd870b..e6645e4941 100644
--- a/tensorflow/compiler/xla/client/lib/testing.cc
+++ b/tensorflow/compiler/xla/client/lib/testing.cc
@@ -21,7 +21,6 @@ limitations under the License.
#include "tensorflow/compiler/xla/literal_util.h"
#include "tensorflow/compiler/xla/shape_util.h"
#include "tensorflow/compiler/xla/statusor.h"
-#include "tensorflow/compiler/xla/tests/test_utils.h"
#include "tensorflow/compiler/xla/types.h"
#include "tensorflow/compiler/xla/util.h"
#include "tensorflow/core/lib/strings/strcat.h"
@@ -49,6 +48,62 @@ std::unique_ptr<GlobalData> MakeFakeDataViaDeviceOrDie(const Shape& shape,
} // namespace
+StatusOr<std::unique_ptr<Literal>> MakeFakeLiteral(const Shape& shape) {
+ if (ShapeUtil::IsTuple(shape)) {
+ std::vector<std::unique_ptr<Literal>> elements;
+ for (const Shape& element_shape : shape.tuple_shapes()) {
+ TF_ASSIGN_OR_RETURN(std::unique_ptr<Literal> element,
+ MakeFakeLiteral(element_shape));
+ elements.push_back(std::move(element));
+ }
+ return Literal::MakeTupleOwned(std::move(elements));
+ }
+ std::unique_ptr<Literal> literal = Literal::CreateFromShape(shape);
+ std::minstd_rand0 engine;
+ switch (shape.element_type()) {
+ case F32: {
+ std::uniform_real_distribution<float> generator(0.0f, 1.0f);
+ TF_CHECK_OK(literal->Populate<float>(
+ [&](tensorflow::gtl::ArraySlice<int64> /*indices*/) {
+ return generator(engine);
+ }));
+ break;
+ }
+ case S32: {
+ std::uniform_int_distribution<int32> generator(
+ std::numeric_limits<int32>::lowest(),
+ std::numeric_limits<int32>::max());
+ TF_CHECK_OK(literal->Populate<int32>(
+ [&](tensorflow::gtl::ArraySlice<int64> /*indices*/) {
+ return generator(engine);
+ }));
+ break;
+ }
+ case S64: {
+ std::uniform_int_distribution<int64> generator(
+ std::numeric_limits<int64>::lowest(),
+ std::numeric_limits<int64>::max());
+ TF_CHECK_OK(literal->Populate<int64>(
+ [&](tensorflow::gtl::ArraySlice<int64> /*indices*/) {
+ return generator(engine);
+ }));
+ break;
+ }
+ case PRED: {
+ std::uniform_int_distribution<int> generator(0, 1);
+ TF_CHECK_OK(literal->Populate<bool>(
+ [&](tensorflow::gtl::ArraySlice<int64> /*indices*/) {
+ return generator(engine);
+ }));
+ break;
+ }
+ default:
+ return Unimplemented("Unsupported type for fake literal generation: %s",
+ ShapeUtil::HumanString(shape).c_str());
+ }
+ return std::move(literal);
+}
+
std::unique_ptr<GlobalData> MakeFakeDataOrDie(const Shape& shape,
Client* client) {
if (ShapeUtil::ByteSizeOf(shape) < (1LL << 30)) {
diff --git a/tensorflow/compiler/xla/client/lib/testing.h b/tensorflow/compiler/xla/client/lib/testing.h
index 7e640d1307..b5c4393dcc 100644
--- a/tensorflow/compiler/xla/client/lib/testing.h
+++ b/tensorflow/compiler/xla/client/lib/testing.h
@@ -26,6 +26,10 @@ limitations under the License.
namespace xla {
+// Generates fake data in a literal of the given shape, or returns an error
+// status if the element type is currently unhandled for fake data generation.
+StatusOr<std::unique_ptr<Literal>> MakeFakeLiteral(const Shape& shape);
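On a StatusOr-returning path, callers would typically unwrap the result with TF_ASSIGN_OR_RETURN, exactly as the tuple branch of the implementation above does for its element literals.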
+
// Generates fake data of the given shape on the device or dies. The fake data
// is created by performing a computation on the device rather than transferring
// data from the host to the device.
diff --git a/tensorflow/compiler/xla/client/local_client.cc b/tensorflow/compiler/xla/client/local_client.cc
index b50425a09c..15c744ecd3 100644
--- a/tensorflow/compiler/xla/client/local_client.cc
+++ b/tensorflow/compiler/xla/client/local_client.cc
@@ -27,6 +27,16 @@ namespace se = ::perftools::gputools;
namespace xla {
+ExecutableBuildOptions& ExecutableBuildOptions::set_platform(
+ perftools::gputools::Platform* platform) {
+ platform_ = platform;
+ return *this;
+}
+
+perftools::gputools::Platform* ExecutableBuildOptions::platform() const {
+ return platform_;
+}
+
ExecutableBuildOptions& ExecutableBuildOptions::set_device_ordinal(
int device_ordinal) {
device_ordinal_ = device_ordinal;
@@ -46,6 +56,16 @@ const Shape* ExecutableBuildOptions::result_layout() const {
return result_layout_set_ ? &result_layout_ : nullptr;
}
+ExecutableBuildOptions& ExecutableBuildOptions::set_has_hybrid_result(
+ bool has_hybrid_result) {
+ has_hybrid_result_ = has_hybrid_result;
+ return *this;
+}
+
+bool ExecutableBuildOptions::has_hybrid_result() const {
+ return has_hybrid_result_;
+}
+
namespace {
StatusOr<Backend::StreamPtr> BorrowStreamForDevice(int device_ordinal,
Backend* backend) {
diff --git a/tensorflow/compiler/xla/client/local_client.h b/tensorflow/compiler/xla/client/local_client.h
index e9eeaa0aa2..9f985ed527 100644
--- a/tensorflow/compiler/xla/client/local_client.h
+++ b/tensorflow/compiler/xla/client/local_client.h
@@ -37,6 +37,14 @@ namespace xla {
// LocalClient::Compile.
class ExecutableBuildOptions {
public:
+ // If set, this is the platform to build the computation for. This must match
+ // the underlying platform of the service. A value of nullptr indicates the
+ // option has not been set.
+ //
+ // TODO(b/28616830): Support multiple platforms.
+ ExecutableBuildOptions& set_platform(perftools::gputools::Platform* platform);
+ perftools::gputools::Platform* platform() const;
+
// If set, this is the device to build the computation for. Valid
// device_ordinal values are: 0 to # of devices - 1. These values are
// identical to the device ordinal values used by StreamExecutor. The built
@@ -53,10 +61,18 @@ class ExecutableBuildOptions {
ExecutableBuildOptions& set_result_layout(const Shape& shape_with_layout);
const Shape* result_layout() const;
+ // If set, the executable will be built to output a hybrid
+ // ShapedBuffer with top-level tuple pointers in host memory and
+ // result buffers in device memory.
+ ExecutableBuildOptions& set_has_hybrid_result(bool has_hybrid_result);
+ bool has_hybrid_result() const;
+
private:
+ perftools::gputools::Platform* platform_ = nullptr;
int device_ordinal_ = -1;
Shape result_layout_;
bool result_layout_set_ = false;
+ bool has_hybrid_result_ = true;
};
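Each setter above returns `*this`, so calls can be chained. A self-contained sketch of that pattern with a simplified stand-in (BuildOptions here is illustrative, not the real ExecutableBuildOptions):

#include <iostream>

class BuildOptions {
 public:
  BuildOptions& set_device_ordinal(int ordinal) {
    device_ordinal_ = ordinal;
    return *this;  // returning *this is what makes the setters chainable
  }
  BuildOptions& set_has_hybrid_result(bool hybrid) {
    has_hybrid_result_ = hybrid;
    return *this;
  }
  int device_ordinal() const { return device_ordinal_; }
  bool has_hybrid_result() const { return has_hybrid_result_; }

 private:
  int device_ordinal_ = -1;        // -1 means "not set", as above
  bool has_hybrid_result_ = true;  // hybrid by default, as above
};

int main() {
  BuildOptions options;
  options.set_device_ordinal(0).set_has_hybrid_result(false);
  std::cout << options.device_ordinal() << " " << std::boolalpha
            << options.has_hybrid_result() << "\n";  // prints: 0 false
}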
class LocalExecutable {
diff --git a/tensorflow/compiler/xla/literal_util.cc b/tensorflow/compiler/xla/literal_util.cc
index 93d3cd425f..fda791401d 100644
--- a/tensorflow/compiler/xla/literal_util.cc
+++ b/tensorflow/compiler/xla/literal_util.cc
@@ -33,20 +33,6 @@ limitations under the License.
#include "tensorflow/core/lib/strings/stringprintf.h"
#include "tensorflow/core/platform/logging.h"
#include "tensorflow/core/platform/types.h"
-namespace {
-using tensorflow::int64;
-
-constexpr bool kLittleEndian = __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__;
-
-// Converts between little and big endian, assuming elements in the array are 16
-// bits long.
-void ConvertEndianShort(char* bytes, int64 size) {
- CHECK_EQ(size % 2, 0);
- for (int64 i = 0; i < size; i += 2) {
- std::swap(bytes[i], bytes[i + 1]);
- }
-}
-} // namespace
namespace xla {
@@ -183,8 +169,6 @@ Status Literal::Copy(const Literal& src_literal,
return CopyRange<int64>(src_literal, src_base, dest_base, copy_size);
case F16:
return CopyRange<half>(src_literal, src_base, dest_base, copy_size);
- case BF16:
- return CopyRange<bfloat16>(src_literal, src_base, dest_base, copy_size);
case F32:
return CopyRange<float>(src_literal, src_base, dest_base, copy_size);
case F64:
@@ -216,8 +200,6 @@ Status Literal::Copy(const Literal& src_literal,
return *Literal::CreateR0<int64>(0);
case F16:
return *Literal::CreateR0<half>(static_cast<half>(0.0f));
- case BF16:
- return *Literal::CreateR0<bfloat16>(static_cast<bfloat16>(0.0f));
case F32:
return *Literal::CreateR0<float>(0);
case F64:
@@ -303,9 +285,6 @@ Status Literal::Copy(const Literal& src_literal,
case F16:
return *Literal::CreateR0<half>(
static_cast<half>(-std::numeric_limits<float>::infinity()));
- case BF16:
- return *Literal::CreateR0<bfloat16>(
- static_cast<bfloat16>(-std::numeric_limits<float>::infinity()));
case TUPLE:
LOG(FATAL) << "tuple element type has no minimum value";
case OPAQUE:
@@ -342,9 +321,6 @@ Status Literal::Copy(const Literal& src_literal,
case F16:
return *Literal::CreateR0<half>(
static_cast<half>(std::numeric_limits<float>::infinity()));
- case BF16:
- return *Literal::CreateR0<bfloat16>(
- static_cast<bfloat16>(std::numeric_limits<float>::infinity()));
case TUPLE:
LOG(FATAL) << "tuple element type has no maximum value";
case OPAQUE:
@@ -452,7 +428,6 @@ std::unique_ptr<Literal> Literal::Transpose(
// The shape with affine layout resulting from that operation will be
// F32[8,11]{0,1}, since it leaves the original most minor dimension (the one
// of size 8) as the most minor.
- //
// Essentially, given MinMaj(Di) the position of the Di dimension within the
// minor to major vector, and given T(Di) the index that the original Di
// dimension has within the transposed array, a layout is affine if
@@ -561,9 +536,6 @@ string Literal::GetAsString(
}
case F16:
return tensorflow::strings::StrCat(Get<half>(multi_index));
- case BF16:
- return tensorflow::strings::StrCat(
- static_cast<float>(Get<bfloat16>(multi_index)));
default:
return tensorflow::strings::StrCat(
"[", PrimitiveType_Name(shape().element_type()), "]");
@@ -597,17 +569,9 @@ int64 Literal::LinearIndex(
return IndexUtil::MultidimensionalIndexToLinearIndex(shape(), multi_index);
}
-string Literal::ToString(bool print_layout) const {
+string Literal::ToString() const {
std::vector<string> pieces;
- auto shape_to_string = [print_layout](const Shape& shape) {
- if (print_layout) {
- return ShapeUtil::HumanStringWithLayout(shape);
- } else {
- return ShapeUtil::HumanString(shape);
- }
- };
-
auto element_to_string =
[this](tensorflow::gtl::ArraySlice<int64> indices) -> string {
PrimitiveType element_type = shape().element_type();
@@ -621,7 +585,7 @@ string Literal::ToString(bool print_layout) const {
// TODO(b/32894291): refactor this code to reduce code duplication.
if (ShapeUtil::IsTuple(shape())) {
- pieces.push_back(shape_to_string(shape()));
+ pieces.push_back(ShapeUtil::HumanString(shape()));
pieces.push_back(" (\n");
pieces.push_back(tensorflow::str_util::Join(
tuple_literals(), ",\n", [](string* out, const Literal& element) {
@@ -637,7 +601,7 @@ string Literal::ToString(bool print_layout) const {
}
pieces.push_back("}");
} else if (ShapeUtil::Rank(shape()) == 2) {
- pieces.push_back(shape_to_string(shape()));
+ pieces.push_back(ShapeUtil::HumanString(shape()));
pieces.push_back(" {\n");
for (int64 i0 = 0; i0 < shape().dimensions(0); ++i0) {
pieces.push_back(" { ");
@@ -649,7 +613,7 @@ string Literal::ToString(bool print_layout) const {
}
pieces.push_back("}");
} else if (ShapeUtil::Rank(shape()) == 3) {
- pieces.push_back(shape_to_string(shape()));
+ pieces.push_back(ShapeUtil::HumanString(shape()));
pieces.push_back(" {\n");
for (int64 i0 = 0; i0 < shape().dimensions(0); ++i0) {
pieces.push_back(i0 > 0 ? ",\n{" : "{");
@@ -664,7 +628,7 @@ string Literal::ToString(bool print_layout) const {
}
pieces.push_back("\n}");
} else if (ShapeUtil::Rank(shape()) == 4) {
- pieces.push_back(shape_to_string(shape()));
+ pieces.push_back(ShapeUtil::HumanString(shape()));
pieces.push_back(" {\n");
for (int64 i0 = 0; i0 < shape().dimensions(0); ++i0) {
pieces.push_back(tensorflow::strings::Printf(" { /*i0=%lld*/\n", i0));
@@ -685,7 +649,7 @@ string Literal::ToString(bool print_layout) const {
}
pieces.push_back("}");
} else if (ShapeUtil::Rank(shape()) == 5) {
- pieces.push_back(shape_to_string(shape()));
+ pieces.push_back(ShapeUtil::HumanString(shape()));
pieces.push_back(" {\n");
for (int64 i0 = 0; i0 < shape().dimensions(0); ++i0) {
pieces.push_back(tensorflow::strings::Printf(" { /*i0=%lld*/\n", i0));
@@ -712,7 +676,7 @@ string Literal::ToString(bool print_layout) const {
}
pieces.push_back("}");
} else {
- pieces.push_back(shape_to_string(shape()));
+ pieces.push_back(ShapeUtil::HumanString(shape()));
pieces.push_back(" {...}");
}
@@ -771,8 +735,6 @@ void* Literal::MutableInternalData() {
return reinterpret_cast<void*>(c64s_.data());
case F16:
return reinterpret_cast<void*>(f16s_.data());
- case BF16:
- return reinterpret_cast<void*>(bf16s_.data());
default:
LOG(FATAL) << "primitive type not supported in literals: "
<< PrimitiveType_Name(shape().element_type());
@@ -815,9 +777,6 @@ void Literal::Reserve(int64 num_elements) {
case F16:
Resize<half>(num_elements, static_cast<half>(0.0f));
break;
- case BF16:
- Resize<bfloat16>(num_elements, static_cast<bfloat16>(0.0f));
- break;
default:
LOG(FATAL) << "primitive type not supported in literals: "
<< PrimitiveType_Name(shape().element_type());
@@ -857,9 +816,6 @@ tensorflow::Status Literal::ValidateLiteral() const {
case F16:
actual = f16s().size() / sizeof(half);
break;
- case BF16:
- actual = bf16s().size();
- break;
default:
return tensorflow::errors::Unimplemented(
"unhandled element type for literal validation: " +
@@ -956,7 +912,6 @@ StatusOr<std::unique_ptr<Literal>> ConvertIfDestTypeMatches(
CONVERT_IF_TYPES_MATCH(F16)
CONVERT_IF_TYPES_MATCH(F32)
CONVERT_IF_TYPES_MATCH(F64)
- CONVERT_IF_TYPES_MATCH(BF16)
#undef CONVERT_IF_TYPES_MATCH
case C64:
return ConvertToC64<primitive_src_type>(src_literal);
@@ -986,9 +941,8 @@ StatusOr<std::unique_ptr<Literal>> Literal::Convert(
CONVERT_IF_DEST_TYPE_MATCHES(F16)
CONVERT_IF_DEST_TYPE_MATCHES(F32)
CONVERT_IF_DEST_TYPE_MATCHES(F64)
- CONVERT_IF_DEST_TYPE_MATCHES(BF16)
#undef CONVERT_IF_DEST_TYPE_MATCHES
- // Other types are not yet supported.
+ // Other types are not yet supported.
default:
return InvalidArgument("Unimplemented: Convert from type %s to type %s",
PrimitiveType_Name(shape().element_type()).c_str(),
@@ -1057,8 +1011,6 @@ bool Literal::operator==(const Literal& other) const {
return EqualElements<double>(*this, other, 0, &multi_index);
case F16:
return EqualElements<half>(*this, other, 0, &multi_index);
- case BF16:
- return EqualElements<bfloat16>(*this, other, 0, &multi_index);
case C64:
return EqualElements<complex64>(*this, other, 0, &multi_index);
default:
@@ -1168,19 +1120,14 @@ tensorflow::gtl::MutableArraySlice<complex64> Literal::GetMutableArraySlice() {
template <>
tensorflow::gtl::MutableArraySlice<half> Literal::GetMutableArraySlice<half>() {
+ // TODO - there is an endianness problem here. Fix it, or wait for uint16
+ // support in protobuf.
auto values = mutable_f16s();
return tensorflow::gtl::MutableArraySlice<half>(values->data(),
values->size());
}
template <>
-tensorflow::gtl::MutableArraySlice<bfloat16>
-Literal::GetMutableArraySlice<bfloat16>() {
- auto values = mutable_bf16s();
- return {values->data(), values->size()};
-}
-
-template <>
tensorflow::gtl::ArraySlice<bool> Literal::GetArraySlice<bool>() const {
CHECK_EQ(shape().element_type(), PRED);
return tensorflow::gtl::ArraySlice<bool>(
@@ -1251,12 +1198,6 @@ tensorflow::gtl::ArraySlice<half> Literal::GetArraySlice<half>() const {
}
template <>
-tensorflow::gtl::ArraySlice<bfloat16> Literal::GetArraySlice<bfloat16>() const {
- CHECK_EQ(shape().element_type(), BF16);
- return {bf16s().data(), bf16s().size()};
-}
-
-template <>
tensorflow::gtl::ArraySlice<complex64> Literal::GetArraySlice<complex64>()
const {
CHECK_EQ(shape().element_type(), C64);
@@ -1304,9 +1245,6 @@ bool Literal::IsAll(int8 value) const {
return AllElementsEqualValue<double>(*this, value);
case F16:
return AllElementsEqualValue<half>(*this, static_cast<half>(value));
- case BF16:
- return AllElementsEqualValue<bfloat16>(*this,
- static_cast<bfloat16>(value));
case PRED:
if (value == 0) {
return AllElementsEqualValue<bool>(*this, false);
@@ -1328,9 +1266,6 @@ bool Literal::IsAllFloat(float value) const {
return AllElementsEqualValue<double>(*this, value);
case F16:
return AllElementsEqualValue<half>(*this, static_cast<half>(value));
- case BF16:
- return AllElementsEqualValue<bfloat16>(*this,
- static_cast<bfloat16>(value));
default:
return false;
}
@@ -1367,8 +1302,6 @@ bool Literal::IsZero(tensorflow::gtl::ArraySlice<int64> indices) const {
return Get<complex64>(indices) == complex64(0.0f, 0.0f);
case F16:
return Get<half>(indices) == static_cast<half>(0.0f);
- case BF16:
- return Get<bfloat16>(indices) == static_cast<bfloat16>(0.0f);
case PRED:
return Get<bool>(indices) == false;
default:
@@ -1437,12 +1370,6 @@ void Literal::Resize<half>(int64 num_elements, half value) {
}
template <>
-void Literal::Resize<bfloat16>(int64 num_elements, bfloat16 value) {
- CHECK_EQ(ShapeUtil::ElementsIn(shape()), num_elements);
- mutable_bf16s()->resize(num_elements, value);
-}
-
-template <>
void Literal::Resize<complex64>(int64 num_elements, complex64 value) {
CHECK_EQ(ShapeUtil::ElementsIn(shape()), num_elements);
mutable_c64s()->resize(num_elements, value);
@@ -1490,19 +1417,6 @@ LiteralProto Literal::ToProto() const {
*proto.mutable_f16s() =
string(reinterpret_cast<const char*>(f16s_.data()),
f16s_.size() * sizeof(half));
- if (!kLittleEndian) {
- ConvertEndianShort(const_cast<char*>(proto.mutable_f16s()->data()),
- proto.f16s().size());
- }
- break;
- case BF16:
- *proto.mutable_bf16s() =
- string(reinterpret_cast<const char*>(bf16s_.data()),
- bf16s_.size() * sizeof(bfloat16));
- if (!kLittleEndian) {
- ConvertEndianShort(const_cast<char*>(proto.mutable_bf16s()->data()),
- proto.bf16s().size());
- }
break;
case F32:
CopyToRepeatedField(proto.mutable_f32s(), f32s());
@@ -1571,21 +1485,6 @@ void Literal::CopyFromProto(const LiteralProto& literal_proto) {
CHECK_EQ(0, s.size() % sizeof(half));
f16s_ = std::vector<half>(s.size() / sizeof(half));
memcpy(f16s_.data(), s.data(), s.size());
-
- if (!kLittleEndian) {
- ConvertEndianShort(reinterpret_cast<char*>(f16s_.data()), s.size());
- }
- break;
- }
- case BF16: {
- const string& s(literal_proto.bf16s());
- CHECK_EQ(0, s.size() % sizeof(bfloat16));
- bf16s_ = std::vector<bfloat16>(s.size() / sizeof(bfloat16));
- memcpy(bf16s_.data(), s.data(), s.size());
-
- if (!kLittleEndian) {
- ConvertEndianShort(reinterpret_cast<char*>(bf16s_.data()), s.size());
- }
break;
}
case F32:
diff --git a/tensorflow/compiler/xla/literal_util.h b/tensorflow/compiler/xla/literal_util.h
index f37e529caf..a1e288829f 100644
--- a/tensorflow/compiler/xla/literal_util.h
+++ b/tensorflow/compiler/xla/literal_util.h
@@ -163,11 +163,6 @@ class Literal {
const std::vector<complex64>& c64s() const { return c64s_; }
std::vector<complex64>* mutable_c64s() { return &c64s_; }
- int bf16s_size() const { return bf16s().size(); }
- bfloat16 bf16s(int i) const { return bf16s_[i]; }
- const std::vector<bfloat16>& bf16s() const { return bf16s_; }
- std::vector<bfloat16>* mutable_bf16s() { return &bf16s_; }
-
int tuple_literals_size() const { return tuple_literals().size(); }
const Literal& tuple_literals(int i) const { return tuple_literals_[i]; }
Literal* add_tuple_literals() {
@@ -455,7 +450,7 @@ class Literal {
tensorflow::Status ValidateLiteral() const;
// Returns a string representation of the literal value.
- string ToString(bool print_layout = false) const;
+ string ToString() const;
// Invokes the "per cell" callback for each element in the provided
// literal with the element's indices and a string representation of
@@ -627,7 +622,6 @@ class Literal {
std::vector<uint16> u16s_;
std::vector<uint32> u32s_;
std::vector<uint64> u64s_;
- std::vector<bfloat16> bf16s_;
std::vector<half> f16s_;
std::vector<float> f32s_;
std::vector<double> f64s_;
@@ -681,9 +675,6 @@ template <>
tensorflow::gtl::ArraySlice<half> Literal::GetArraySlice<half>() const;
template <>
-tensorflow::gtl::ArraySlice<bfloat16> Literal::GetArraySlice<bfloat16>() const;
-
-template <>
tensorflow::gtl::ArraySlice<complex64> Literal::GetArraySlice<complex64>()
const;
@@ -724,9 +715,6 @@ template <>
tensorflow::gtl::MutableArraySlice<half> Literal::GetMutableArraySlice();
template <>
-tensorflow::gtl::MutableArraySlice<bfloat16> Literal::GetMutableArraySlice();
-
-template <>
tensorflow::gtl::MutableArraySlice<complex64> Literal::GetMutableArraySlice();
template <>
@@ -760,9 +748,6 @@ template <>
void Literal::Resize<half>(int64 num_elements, half value);
template <>
-void Literal::Resize<bfloat16>(int64 num_elements, bfloat16 value);
-
-template <>
void Literal::Resize<complex64>(int64 num_elements, complex64 value);
template <typename NativeT>
@@ -1005,14 +990,6 @@ inline half Literal::Get<half>(
return GetArraySlice<half>()[linear_index];
}
-template <>
-inline bfloat16 Literal::Get<bfloat16>(
- tensorflow::gtl::ArraySlice<int64> multi_index) const {
- CHECK(shape().element_type() == BF16);
- int64 linear_index = LinearIndex(multi_index);
- return GetArraySlice<bfloat16>()[linear_index];
-}
-
template <typename NativeT>
void Literal::Set(tensorflow::gtl::ArraySlice<int64> multi_index,
NativeT value) {
diff --git a/tensorflow/compiler/xla/literal_util_test.cc b/tensorflow/compiler/xla/literal_util_test.cc
index 1e08101759..6d596da4ad 100644
--- a/tensorflow/compiler/xla/literal_util_test.cc
+++ b/tensorflow/compiler/xla/literal_util_test.cc
@@ -110,18 +110,6 @@ TEST_F(LiteralUtilTest, LiteralScalarToString) {
auto c64_lit = Literal::CreateR0<complex64>({3.14f, 2.78f});
ASSERT_EQ("(3.14, 2.78)", c64_lit->ToString());
-
- auto bf16_lit = Literal::CreateR0<bfloat16>(static_cast<bfloat16>(0.5f));
- ASSERT_EQ("0.5", bf16_lit->ToString());
-
- // 3.14 will be rounded to 3.140625 in bfloat16 format (round to nearest even).
- auto bf16_lit_truncated =
- Literal::CreateR0<bfloat16>(static_cast<bfloat16>(3.14f));
- ASSERT_EQ("3.140625", bf16_lit_truncated->ToString());
-
- auto bf16_lit_truncated2 =
- Literal::CreateR0<bfloat16>(static_cast<bfloat16>(9.001f));
- ASSERT_EQ("9", bf16_lit_truncated2->ToString());
}
TEST_F(LiteralUtilTest, LiteralVectorToString) {
@@ -409,18 +397,6 @@ TEST_F(LiteralUtilTest, IsAll) {
EXPECT_FALSE(Literal::CreateR2<half>({{h8}, {h9}})->IsAll(8));
EXPECT_FALSE(Literal::CreateR2<half>({{h9}, {h8}})->IsAll(8));
- bfloat16 b8(8.0f);
- bfloat16 b9(9.0f);
-
- EXPECT_TRUE(Literal::CreateR2<bfloat16>({{b8}, {b8}})->IsAll(8));
- EXPECT_FALSE(Literal::CreateR2<bfloat16>({{b8}, {b9}})->IsAll(8));
- EXPECT_FALSE(Literal::CreateR2<bfloat16>({{b9}, {b8}})->IsAll(8));
-
- // 9.001 will be truncated to 9.0
- bfloat16 b91(9.001f);
- bfloat16 b90(9.00f);
- EXPECT_TRUE(Literal::CreateR2<bfloat16>({{b91}, {b90}})->IsAll(9.0));
-
complex64 c8_9 = {8, 9};
EXPECT_FALSE(Literal::CreateR2<complex64>({{c8_9}, {c8_9}})->IsAll(8));
@@ -715,30 +691,6 @@ TEST_F(LiteralUtilTest, PopulateR2C64) {
EXPECT_EQ(output, *expected);
}
-TEST_F(LiteralUtilTest, PopulateWithValueR0BF16) {
- Literal output;
- bfloat16 h(0.25f);
- output.PopulateWithValue<bfloat16>(h, {});
- auto expected = Literal::CreateR0<bfloat16>(h);
- EXPECT_EQ(output, *expected);
-}
-
-TEST_F(LiteralUtilTest, PopulateWithValueR1BF16) {
- Literal output;
- bfloat16 h(0.5f);
- output.PopulateWithValue<bfloat16>(h, {3});
- auto expected = Literal::CreateR1<bfloat16>({h, h, h});
- EXPECT_EQ(output, *expected);
-}
-
-TEST_F(LiteralUtilTest, PopulateWithValueR2BF16) {
- Literal output;
- bfloat16 h(2.0f);
- output.PopulateWithValue<bfloat16>(h, {2, 2});
- auto expected = Literal::CreateR2<bfloat16>({{h, h}, {h, h}});
- EXPECT_EQ(output, *expected);
-}
-
TEST_F(LiteralUtilTest, PopulateWithValueR0F32) {
Literal output;
output.PopulateWithValue<float>(2.5f, {});
@@ -1023,14 +975,6 @@ TEST_F(LiteralUtilTest, ConvertIfTypesMatch) {
{{half(26.0), half(0.0), half(28.0), half(0.0)},
{half(0.0), half(31.0), half(0.0), half(33.0)}},
}}, layout_r4_dim0major_);
- auto bf16 = Literal::CreateR4WithLayout<bfloat16>({{
- {{bfloat16(10.0), bfloat16(0.0), bfloat16(12.0), bfloat16(0.0)},
- {bfloat16(0.0), bfloat16(15.0), bfloat16(0.0), bfloat16(17.0)}},
- {{bfloat16(0.0), bfloat16(19.0), bfloat16(0.0), bfloat16(21.0)},
- {bfloat16(22.0), bfloat16(0.0), bfloat16(24.0), bfloat16(0.0)}},
- {{bfloat16(26.0), bfloat16(0.0), bfloat16(28.0), bfloat16(0.0)},
- {bfloat16(0.0), bfloat16(31.0), bfloat16(0.0), bfloat16(33.0)}},
- }}, layout_r4_dim0major_);
auto f32 = Literal::CreateR4WithLayout<float>({{
{{10.0f, 0.0f, 12.0f, 0.0f}, {0.0f, 15.0f, 0.0f, 17.0f}},
{{0.0f, 19.0f, 0.0f, 21.0f}, {22.0f, 0.0f, 24.0f, 0.0f}},
@@ -1064,12 +1008,6 @@ TEST_F(LiteralUtilTest, ConvertIfTypesMatch) {
conv = s8->Convert(PRED).ConsumeValueOrDie();
EXPECT_EQ(*conv, *pred);
- conv = bf16->Convert(S32).ConsumeValueOrDie();
- EXPECT_EQ(*conv, *s32);
-
- conv = bf16->Convert(F32).ConsumeValueOrDie();
- EXPECT_EQ(*conv, *f32);
-
conv = pred->Convert(S32).ConsumeValueOrDie();
EXPECT_EQ(*conv, *int32_pred);
diff --git a/tensorflow/compiler/xla/primitive_util.cc b/tensorflow/compiler/xla/primitive_util.cc
index 2bce56b7bd..2113b5e06f 100644
--- a/tensorflow/compiler/xla/primitive_util.cc
+++ b/tensorflow/compiler/xla/primitive_util.cc
@@ -79,11 +79,6 @@ PrimitiveType NativeToPrimitiveType<double>() {
}
template <>
-PrimitiveType NativeToPrimitiveType<bfloat16>() {
- return BF16;
-}
-
-template <>
PrimitiveType NativeToPrimitiveType<half>() {
return F16;
}
@@ -94,7 +89,7 @@ PrimitiveType NativeToPrimitiveType<complex64>() {
}
bool IsFloatingPointType(PrimitiveType type) {
- return type == F16 || type == F32 || type == F64 || type == BF16;
+ return type == F16 || type == F32 || type == F64;
}
bool IsComplexType(PrimitiveType type) { return type == C64; }
@@ -123,7 +118,6 @@ int BitWidth(PrimitiveType type) {
case S16:
case U16:
case F16:
- case BF16:
return 16;
case U32:
diff --git a/tensorflow/compiler/xla/primitive_util.h b/tensorflow/compiler/xla/primitive_util.h
index 19c6a13888..a49c8b86fc 100644
--- a/tensorflow/compiler/xla/primitive_util.h
+++ b/tensorflow/compiler/xla/primitive_util.h
@@ -77,8 +77,6 @@ template <>
PrimitiveType NativeToPrimitiveType<double>();
template <>
PrimitiveType NativeToPrimitiveType<half>();
-template <>
-PrimitiveType NativeToPrimitiveType<bfloat16>();
// Complex
template <>
@@ -169,11 +167,6 @@ struct PrimitiveTypeToNative<F16> {
using type = half;
};
-template <>
-struct PrimitiveTypeToNative<BF16> {
- using type = bfloat16;
-};
-
// Complex
template <>
struct PrimitiveTypeToNative<C64> {
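The header edit above removes one half of a two-way mapping; a sketch of the pattern with simplified names (not the actual XLA declarations) shows why the .cc and .h entries must be deleted together:

    #include <type_traits>

    enum PrimitiveType { F16, F32, F64 };

    // Native type -> enum value.
    template <typename T> PrimitiveType NativeToPrimitiveType();
    template <> PrimitiveType NativeToPrimitiveType<float>() { return F32; }
    template <> PrimitiveType NativeToPrimitiveType<double>() { return F64; }

    // Enum value -> native type.
    template <PrimitiveType P> struct PrimitiveTypeToNative;
    template <> struct PrimitiveTypeToNative<F32> { using type = float; };
    template <> struct PrimitiveTypeToNative<F64> { using type = double; };

    // Removing only one direction (as a stray BF16 entry would) breaks this
    // round-trip guarantee.
    static_assert(std::is_same<PrimitiveTypeToNative<F32>::type, float>::value,
                  "enum->native and native->enum maps must stay in sync");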
diff --git a/tensorflow/compiler/xla/service/BUILD b/tensorflow/compiler/xla/service/BUILD
index 7cf24641b5..521fe411a4 100644
--- a/tensorflow/compiler/xla/service/BUILD
+++ b/tensorflow/compiler/xla/service/BUILD
@@ -90,8 +90,6 @@ cc_library(
":shape_inference",
"//tensorflow/compiler/xla:literal_util",
"//tensorflow/compiler/xla:shape_util",
- "//tensorflow/compiler/xla:status",
- "//tensorflow/compiler/xla:status_macros",
"//tensorflow/compiler/xla:statusor",
"//tensorflow/compiler/xla:types",
"//tensorflow/compiler/xla:util",
@@ -1780,6 +1778,7 @@ tf_cc_test(
"//tensorflow/compiler/xla:xla_data_proto",
"//tensorflow/compiler/xla/tests:hlo_test_base",
"//tensorflow/compiler/xla/tests:test_utils",
+ "//tensorflow/compiler/xla/tests:xla_internal_test_main",
"//tensorflow/core:lib",
],
)
@@ -1850,6 +1849,7 @@ tf_cc_test(
"//tensorflow/compiler/xla/tests:hlo_test_base",
"//tensorflow/compiler/xla/tests:literal_test_util",
"//tensorflow/compiler/xla/tests:test_utils",
+ "//tensorflow/compiler/xla/tests:xla_internal_test_main",
"//tensorflow/core:lib",
],
)
diff --git a/tensorflow/compiler/xla/service/backend.cc b/tensorflow/compiler/xla/service/backend.cc
index 05f2d06278..9abe30e3f3 100644
--- a/tensorflow/compiler/xla/service/backend.cc
+++ b/tensorflow/compiler/xla/service/backend.cc
@@ -13,14 +13,14 @@ See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
-#define EIGEN_USE_THREADS
-
#include "tensorflow/compiler/xla/service/backend.h"
#include <algorithm>
#include <string>
#include <utility>
+#define EIGEN_USE_THREADS
+
#include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
#include "tensorflow/compiler/xla/service/compiler.h"
#include "tensorflow/compiler/xla/service/platform_util.h"
diff --git a/tensorflow/compiler/xla/service/buffer_assignment.cc b/tensorflow/compiler/xla/service/buffer_assignment.cc
index 3c5b360c8e..b422b22df9 100644
--- a/tensorflow/compiler/xla/service/buffer_assignment.cc
+++ b/tensorflow/compiler/xla/service/buffer_assignment.cc
@@ -497,19 +497,19 @@ Status GatherComputationsByAllocationType(
std::vector<const HloComputation*>* global_computations) {
// Create a worklist of computations paired with whether the allocation must
// be thread-local.
- std::deque<std::pair<const HloComputation*, bool>> worklist;
+ std::deque<std::pair<HloComputation*, bool>> worklist;
worklist.push_back(std::make_pair(module->entry_computation(),
/*is_thread_local*/ false));
// Sets for quickly checking membership. Computations are returned in vectors
// for stable iteration.
- FlatSet<const HloComputation*> thread_local_set;
- FlatSet<const HloComputation*> global_set;
+ FlatSet<HloComputation*> thread_local_set;
+ FlatSet<HloComputation*> global_set;
while (!worklist.empty()) {
auto worklist_front = worklist.front();
worklist.pop_front();
- const HloComputation* computation = worklist_front.first;
+ HloComputation* computation = worklist_front.first;
bool is_thread_local = worklist_front.second;
bool in_thread_local_set = thread_local_set.count(computation) > 0;
bool in_global_set = global_set.count(computation) > 0;
@@ -653,7 +653,7 @@ bool BufferAssigner::MaybeAssignBuffer(BufferAllocation* allocation,
}
if (allow_input_output_aliasing_ && allocation->maybe_live_out()) {
- const HloComputation* entry_computation =
+ HloComputation* entry_computation =
assignment->module_->entry_computation();
for (auto param : entry_computation->parameter_instructions()) {
for (auto& param_buffer :
@@ -819,6 +819,17 @@ Status BufferAssigner::AssignBuffersForComputation(
continue;
}
+ if (instruction->opcode() == HloOpcode::kRecv) {
+ // Make sure that recv operations get a new unique allocation so that
+ // they don't share their buffer with any other operations.
+ BufferAllocation* allocation = assignment->NewAllocation(
+ *buffer, buffer_size, is_thread_local, /*is_reusable=*/false);
+ allocation_indices.push_back(allocation->index());
+ VLOG(3) << "New allocation #" << allocation->index()
+ << " for recv: " << *buffer;
+ continue;
+ }
+
if (ShapeUtil::IsTuple(buffer->shape())) {
// TODO(b/34669761): Don't reuse tuple buffers because the GPU backend
// assumes longer buffer liveness than indicated by the analysis.
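The new special case reads as: every kRecv gets a fresh allocation flagged non-reusable, so no later buffer is folded into it. A toy model of that rule (simplified types; XLA's BufferAllocation carries far more state):

    #include <vector>

    struct Allocation {
      int index;
      bool is_reusable;
    };

    struct Assigner {
      std::vector<Allocation> allocations;

      // Mirrors the NewAllocation(..., /*is_reusable=*/false) call above.
      int NewAllocation(bool is_reusable) {
        allocations.push_back(
            {static_cast<int>(allocations.size()), is_reusable});
        return allocations.back().index;
      }

      // Reuse candidates skip non-reusable allocations, so a recv buffer is
      // never shared with any other operation.
      const Allocation* FindReusable() const {
        for (const auto& a : allocations) {
          if (a.is_reusable) return &a;
        }
        return nullptr;
      }
    };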
diff --git a/tensorflow/compiler/xla/service/cpu/BUILD b/tensorflow/compiler/xla/service/cpu/BUILD
index 4f6e69ebd4..6213baee2f 100644
--- a/tensorflow/compiler/xla/service/cpu/BUILD
+++ b/tensorflow/compiler/xla/service/cpu/BUILD
@@ -280,7 +280,6 @@ cc_library(
srcs = ["dot_op_emitter.cc"],
hdrs = ["dot_op_emitter.h"],
deps = [
- ":cpu_options",
":cpu_runtime",
":ir_emission_utils",
"//tensorflow/compiler/xla:shape_util",
@@ -291,10 +290,8 @@ cc_library(
"//tensorflow/compiler/xla/service:hlo",
"//tensorflow/compiler/xla/service:hlo_module_config",
"//tensorflow/compiler/xla/service/llvm_ir:ir_array",
- "//tensorflow/compiler/xla/service/llvm_ir:kernel_support_library",
"//tensorflow/compiler/xla/service/llvm_ir:llvm_loop",
"//tensorflow/compiler/xla/service/llvm_ir:llvm_util",
- "//tensorflow/compiler/xla/service/llvm_ir:vector_support_library",
"//tensorflow/core:lib",
"@llvm//:core",
],
@@ -720,7 +717,6 @@ cc_library(
hdrs = ["cpu_options.h"],
deps = [
"//tensorflow/compiler/xla/service:hlo_module_config",
- "//tensorflow/core:lib",
],
)
diff --git a/tensorflow/compiler/xla/service/cpu/cpu_options.cc b/tensorflow/compiler/xla/service/cpu/cpu_options.cc
index 09f028463a..dba140d112 100644
--- a/tensorflow/compiler/xla/service/cpu/cpu_options.cc
+++ b/tensorflow/compiler/xla/service/cpu/cpu_options.cc
@@ -15,14 +15,11 @@ limitations under the License.
#include "tensorflow/compiler/xla/service/cpu/cpu_options.h"
-#include "tensorflow/core/lib/strings/numbers.h"
-
namespace {
const char* const kXlaParallelCpuOption = "xla_cpu_parallel";
const char* const kXlaOptimizeForSizeCpuOption = "xla_cpu_optimize_for_size";
const char* const kXlaDisableVectorizedReduce = "xla_disable_vectorized_reduce";
-const char* const kLlvmIrDotTilingFactor = "xla_llvm_dot_tiling_factor";
} // namespace
@@ -48,19 +45,6 @@ bool VectorizedReduceDisabled(const HloModuleConfig& config) {
return extra_options_map.count(kXlaOptimizeForSizeCpuOption) > 0;
}
-tensorflow::gtl::optional<int64> LlvmIrGemvTilingFactor(
- const HloModuleConfig& config) {
- const auto& extra_options_map =
- config.debug_options().xla_backend_extra_options();
- auto it = extra_options_map.find(kLlvmIrDotTilingFactor);
- int64 tiling_factor;
- if (it != extra_options_map.end() &&
- tensorflow::strings::safe_strto64(it->second, &tiling_factor)) {
- return tiling_factor;
- }
- return tensorflow::gtl::nullopt;
-}
-
} // namespace options
} // namespace cpu
} // namespace xla
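For context, the removed accessor looked up "xla_llvm_dot_tiling_factor" in the backend extra-options string map and parsed it as an integer, yielding nullopt on a missing or malformed entry. A rough standalone equivalent, with std::optional and std::stoll standing in for tensorflow::gtl::optional and strings::safe_strto64:

    #include <cstdint>
    #include <map>
    #include <optional>
    #include <string>

    std::optional<int64_t> GemvTilingFactor(
        const std::map<std::string, std::string>& extra_options) {
      auto it = extra_options.find("xla_llvm_dot_tiling_factor");
      if (it == extra_options.end()) return std::nullopt;
      try {
        return std::stoll(it->second);  // safe_strto64 fails without throwing
      } catch (...) {
        return std::nullopt;
      }
    }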
diff --git a/tensorflow/compiler/xla/service/cpu/cpu_options.h b/tensorflow/compiler/xla/service/cpu/cpu_options.h
index 6ba0fd2453..5dc24ebc7b 100644
--- a/tensorflow/compiler/xla/service/cpu/cpu_options.h
+++ b/tensorflow/compiler/xla/service/cpu/cpu_options.h
@@ -27,8 +27,6 @@ namespace options {
bool CpuParallelBackendRequested(const HloModuleConfig& config);
bool OptimizeForSizeRequested(const HloModuleConfig& config);
bool VectorizedReduceDisabled(const HloModuleConfig& config);
-tensorflow::gtl::optional<int64> LlvmIrGemvTilingFactor(
- const HloModuleConfig& config);
} // namespace options
} // namespace cpu
diff --git a/tensorflow/compiler/xla/service/cpu/cpu_runtime_test.cc b/tensorflow/compiler/xla/service/cpu/cpu_runtime_test.cc
index f385829cdf..f8e260dd90 100644
--- a/tensorflow/compiler/xla/service/cpu/cpu_runtime_test.cc
+++ b/tensorflow/compiler/xla/service/cpu/cpu_runtime_test.cc
@@ -12,13 +12,15 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
-#define EIGEN_USE_THREADS
+
#include "tensorflow/compiler/xla/service/cpu/cpu_runtime.h"
#include <memory>
#include <string>
#include <tuple>
+#define EIGEN_USE_THREADS
+
#include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
#include "tensorflow/compiler/xla/array2d.h"
#include "tensorflow/compiler/xla/client/local_client.h"
diff --git a/tensorflow/compiler/xla/service/cpu/dot_op_emitter.cc b/tensorflow/compiler/xla/service/cpu/dot_op_emitter.cc
index 2a447a54b0..e57d49172b 100644
--- a/tensorflow/compiler/xla/service/cpu/dot_op_emitter.cc
+++ b/tensorflow/compiler/xla/service/cpu/dot_op_emitter.cc
@@ -25,9 +25,7 @@ limitations under the License.
#include "tensorflow/compiler/xla/service/cpu/cpu_runtime.h"
#include "tensorflow/compiler/xla/service/cpu/ir_emission_utils.h"
#include "tensorflow/compiler/xla/service/hlo_module.h"
-#include "tensorflow/compiler/xla/service/llvm_ir/kernel_support_library.h"
#include "tensorflow/compiler/xla/service/llvm_ir/llvm_util.h"
-#include "tensorflow/compiler/xla/service/llvm_ir/vector_support_library.h"
#include "tensorflow/compiler/xla/shape_util.h"
#include "tensorflow/compiler/xla/status_macros.h"
#include "tensorflow/compiler/xla/util.h"
@@ -40,450 +38,6 @@ using llvm_ir::SetToFirstInsertPoint;
namespace cpu {
-namespace {
-// Loads a tile of values from a 2D tensor.
-class TileLoader {
- public:
- // Constructs a TileLoader that will load a tile consisting of
- // `tile_size_along_major_dim` vectors from the matrix `matrix`, starting at
- // `major_dim_offset` in the major dimension. The tile size along the minor
- // dimension is the vector size, and that is implicitly determined by `vsl`.
- TileLoader(VectorSupportLibrary* vsl, llvm::IRBuilder<>* ir_builder,
- llvm::Value* matrix, int64 matrix_size_along_minor_dim,
- llvm::Value* major_dim_offset, int64 tile_size_along_major_dim)
- : vsl_(vsl) {
- pointers_.reserve(tile_size_along_major_dim);
- for (int64 i = 0; i < tile_size_along_major_dim; i++) {
- llvm::Value* total_offset = ir_builder->CreateMul(
- ir_builder->getInt64(matrix_size_along_minor_dim),
- ir_builder->CreateAdd(ir_builder->getInt64(i), major_dim_offset));
- pointers_.push_back(vsl_->ComputeOffsetPointer(matrix, total_offset));
- }
- }
-
- // Load a tile consisting of `tile_size_along_major_dim_` vectors starting at
- // `major_dim_offset_` in the major dimension and `minor_dim_offset` in the
- // minor dimension.
- std::vector<llvm::Value*> LoadTile(llvm::Value* minor_dim_offset) const {
- std::vector<llvm::Value*> result;
- result.reserve(pointers_.size());
- for (const auto& pointer : pointers_) {
- result.push_back(vsl_->LoadVector(pointer, minor_dim_offset));
- }
- return result;
- }
-
- private:
- VectorSupportLibrary* vsl_;
- std::vector<llvm::Value*> pointers_;
-};
-
-// Computes a dot product between an "[M,K]{0,1} lhs" and a [K,1] vector (the
-// layout of the vector does not matter). This implementation uses a tiling
-// scheme to improve performance.
-//
-// We logically separate the LHS matrix into four segments:
-//
-// +----------------------+---+
-// | | |
-// | | |
-// | A | B |
-// | | |
-// | | |
-// | | |
-// +----------------------+---+
-// | C | D |
-// +----------------------+---+
-//
-// where A is the largest submatrix of the LHS that can be evenly divided into
-// tiles. For each tile in A, assuming tile_rows_ == tile_cols_ == 4, we have:
-//
-// +---+---+---+---+ +--+--+--+--+
-// |M00|M10|M20|M30| |V0|V1|V2|V3|
-// +---+---+---+---+ +--+--+--+--+
-// |M01|M11|M21|M31| and |V0|V1|V2|V3|
-// +---+---+---+---+ +--+--+--+--+
-// |M02|M12|M22|M32| |V0|V1|V2|V3|
-// +---+---+---+---+ +--+--+--+--+
-// |M03|M13|M23|M33| |V0|V1|V2|V3|
-// +---+---+---+---+ +--+--+--+--+
-//
-// (Legend: rows are horizontal and columns are vertical, and each column is
-// one llvm::Value of a vector type.)
-//
-// where:
-//
-// a. The left tile is from the column major left matrix.
-// b. The right tile is an elementwise broadcast of a [V0, V1, V2, V3]
-// vector loaded from the RHS vector.
-//
-// As we iterate through the column dimension, we compute the change to the
-// result vector by an elementwise multiplication between the two tiles above
-// followed by a reduction along the major dimension:
-//
-// +-----------------------------------+
-// | M00*V0 + M10*V1 + M20*V2 + M30*V3 |
-// +-----------------------------------+
-// | M01*V0 + M11*V1 + M21*V2 + M31*V3 |
-// Result[R:R+4] += +-----------------------------------+
-// | M02*V0 + M12*V1 + M22*V2 + M32*V3 |
-// +-----------------------------------+
-// | M03*V0 + M13*V1 + M23*V2 + M33*V3 |
-// +-----------------------------------+
-//
-// Where R is the starting row for the tile.
-//
-// We have an inner epilogue loop to deal with the "C" submatrix and an outer
-// epilogue loop to deal with the B,D submatrix.
-//
-// TODO(sanjoy): We should investigate whether gather loads and scatter stores
-// can be used here to get the same inner loop for both column-major and
-// row-major matrix-vector products.
-class ColumnMajorMatrixVectorProductEmitter {
- public:
- ColumnMajorMatrixVectorProductEmitter(PrimitiveType scalar_type,
- int64 tile_rows, int64 tile_cols,
- int64 m, int64 k, llvm::Value* lhs,
- llvm::Value* rhs, llvm::Value* result,
- llvm::IRBuilder<>* ir_builder)
- : scalar_type_(scalar_type),
- tile_rows_(tile_rows),
- tile_cols_(tile_cols),
- m_(m),
- k_(k),
- lhs_(lhs),
- rhs_(rhs),
- result_(result),
- ir_builder_(ir_builder),
- ksl_(ir_builder_),
- vsl_(scalar_type_, /*vector_size=*/tile_rows_, ir_builder_, "") {
- CHECK(tile_rows_ > 0 && IsPowerOfTwo(static_cast<uint64>(tile_rows_)));
- }
-
- void Emit();
-
- private:
- void EmitOuterLoopBody(llvm::Value* column, int64 column_count,
- bool is_first_column);
-
- TileLoader GetLhsTileLoader(llvm::Value* column_start, int64 column_count) {
- return TileLoader(&vsl_, ir_builder_, /*matrix=*/lhs_,
- /*matrix_size_along_minor_dim=*/m_,
- /*major_dim_offset=*/column_start,
- /*tile_size_along_major_dim=*/column_count);
- }
-
- // Load a tile of values from the RHS. For the RHS a "tile" is a contiguous
- // sequence of `count` values, each one broadcasted to the vector width.
- std::vector<llvm::Value*> LoadRhsTile(llvm::Value* offset, int64 count) {
- llvm::Value* base_pointer = vsl_.ComputeOffsetPointer(rhs_, offset);
- std::vector<llvm::Value*> result;
- result.reserve(count);
- for (int64 i = 0; i < count; i++) {
- result.push_back(vsl_.LoadBroadcast(base_pointer, i));
- }
- return result;
- }
-
- void EmitInnerLoopTiled(TileLoader* lhs_tile_loader,
- const std::vector<llvm::Value*>& rhs_tile,
- int64 columns, bool is_first_column);
-
- void EmitInnerLoopEpilogue(llvm::Value* current_tile_col, int64 columns,
- bool is_first_tiled_column);
-
- PrimitiveType scalar_type_;
- int64 tile_rows_;
- int64 tile_cols_;
- int64 m_;
- int64 k_;
- llvm::Value* lhs_;
- llvm::Value* rhs_;
- llvm::Value* result_;
- llvm::IRBuilder<>* ir_builder_;
- KernelSupportLibrary ksl_;
- VectorSupportLibrary vsl_;
-};
-
-void ColumnMajorMatrixVectorProductEmitter::EmitOuterLoopBody(
- llvm::Value* column, int64 column_count, bool is_first_column) {
- TileLoader lhs_tile_loader = GetLhsTileLoader(/*column_start=*/column,
- /*column_count=*/column_count);
-
- std::vector<llvm::Value*> rhs_tile =
- LoadRhsTile(column, /*count=*/column_count);
- EmitInnerLoopTiled(&lhs_tile_loader, rhs_tile,
- /*columns=*/column_count, is_first_column);
- EmitInnerLoopEpilogue(column, /*columns=*/column_count, is_first_column);
-}
-
-void ColumnMajorMatrixVectorProductEmitter::Emit() {
- // See the comment on the class declaration for the algorithm used here.
- int64 column_remainder = k_ % tile_cols_;
- int64 column_limit = k_ - column_remainder;
-
- ksl_.For("dot.outer.tiled",
- /*start=*/0, /*end=*/column_limit, /*step=*/tile_cols_,
- [&](llvm::Value* column, bool is_first_column) {
- EmitOuterLoopBody(column, tile_cols_, is_first_column);
- });
-
- if (column_remainder != 0) {
- EmitOuterLoopBody(ir_builder_->getInt64(column_limit), column_remainder,
- column_limit == 0);
- }
-}
-
-void ColumnMajorMatrixVectorProductEmitter::EmitInnerLoopTiled(
- TileLoader* lhs_tile_loader, const std::vector<llvm::Value*>& rhs_tile,
- int64 columns, bool is_first_column) {
- int64 row_limit = m_ - (m_ % tile_rows_);
-
- ksl_.For("dot.inner.tiled", /*start=*/0, /*end=*/row_limit,
- /*step=*/tile_rows_, [&](llvm::Value* row) {
- std::vector<llvm::Value*> lhs_tile =
- lhs_tile_loader->LoadTile(/*minor_dim_offset=*/row);
- llvm::Value* accumulator = is_first_column
- ? vsl_.GetZeroVector()
- : vsl_.LoadVector(result_, row);
- for (int i = 0; i < columns; i++) {
- accumulator = vsl_.MulAdd(lhs_tile[i], rhs_tile[i], accumulator);
- }
- vsl_.StoreVector(accumulator, result_, row);
- });
-}
-
-void ColumnMajorMatrixVectorProductEmitter::EmitInnerLoopEpilogue(
- llvm::Value* current_tile_col, int64 columns, bool is_first_tiled_column) {
- int64 row_start = m_ - (m_ % tile_rows_);
- if (row_start == m_) {
- return;
- }
-
- llvm::Value* columns_llvm = ir_builder_->getInt64(columns);
-
- // for (col = current_tile_col; col < (columns + current_tile_col); col++)
- // for (row = row_start; row < m_; row++) {
- // result[row] += lhs[row, col] * rhs[col]
- // // Also take into account that if col is 0 then result[row] is not
- // // initialized.
- // }
-
- ksl_.For(
- "dot.inner.epilg.outer", /*start=*/current_tile_col,
- /*end=*/ir_builder_->CreateAdd(columns_llvm, current_tile_col),
- /*step=*/1, /*peel_first_iteration=*/false,
- [&](llvm::Value* col, llvm::Value* is_first_scalar_col) {
- llvm::Value* rhs_element = vsl_.LoadScalar(rhs_, col);
- llvm::Value* total_offset =
- ir_builder_->CreateMul(col, ir_builder_->getInt64(m_));
- llvm::Value* lhs_base_pointer =
- vsl_.ComputeOffsetPointer(lhs_, total_offset);
- ksl_.For(
- "dot.inner.epilg.inner", /*start=*/row_start, /*end=*/m_,
- /*step=*/1, [&](llvm::Value* scalar_row) {
- llvm::Value* product = vsl_.Mul(
- vsl_.LoadScalar(lhs_base_pointer, scalar_row), rhs_element);
- llvm::Value* setting_result_first_time = ir_builder_->CreateAnd(
- is_first_scalar_col,
- ir_builder_->getInt1(is_first_tiled_column));
- ksl_.If(
- setting_result_first_time,
- [&]() { vsl_.StoreScalar(product, result_, scalar_row); },
- [&]() {
- vsl_.StoreScalar(
- vsl_.Add(vsl_.LoadScalar(result_, scalar_row), product),
- result_, scalar_row);
- });
- });
- });
-}
-
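A scalar reference for what the deleted emitter computed may help; this is a sketch of the arithmetic only, not of the emitted IR or its tiling:

    #include <vector>

    // result[r] = sum over c of lhs[r, c] * rhs[c], with lhs stored column
    // major, so m is the minor dimension (matrix_size_along_minor_dim above).
    void ColumnMajorGemv(const std::vector<float>& lhs,  // m * k entries
                         const std::vector<float>& rhs,  // k entries
                         std::vector<float>* result,     // m entries, zeroed
                         int m, int k) {
      // The emitter walked columns in tiles of tile_cols and rows in vectors
      // of tile_rows lanes; the math reduces to this double loop.
      for (int c = 0; c < k; ++c) {
        for (int r = 0; r < m; ++r) {
          (*result)[r] += lhs[c * m + r] * rhs[c];
        }
      }
    }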
-// Computes a dot product between an "[M,K]{1,0} lhs" and a [K,1] vector (the
-// layout of the vector does not matter). This implementation uses a tiling
-// scheme to improve performance.
-//
-// We logically separate the LHS matrix into four segments:
-//
-// +----------------------+---+
-// | | |
-// | | |
-// | A | B |
-// | | |
-// | | |
-// | | |
-// +----------------------+---+
-// | C | D |
-// +----------------------+---+
-//
-// where A is the largest submatrix of the LHS that can be evenly divided into
-// tiles. For each tile in A, assuming tile_rows_ == tile_cols_ == 4, we have:
-//
-// +---+---+---+---+
-// |M00|M10|M20|M30|
-// +---+---+---+---+ +--+--+--+--+
-// |M01|M11|M21|M31| and |V0|V1|V2|V3|
-// +---+---+---+---+ +--+--+--+--+
-// |M02|M12|M22|M32|
-// +---+---+---+---+
-// |M03|M13|M23|M33|
-// +---+---+---+---+
-//
-// (Legend: rows are horizontal and columns are vertical, and each row is one
-// llvm::Value of a vector type.)
-//
-// where:
-//
-// a. The left tile is loaded from the row major left matrix.
-// b. The right vector is loaded from the RHS vector.
-//
-// We keep 4 vector accumulators accumulating the following four vector
-// expressions as we iterate over the row dimension:
-//
-// +------+------+------+------+
-// |M0I*V0|M1I*V1|M2I*V2|M3I*V3| for I in [0,4)
-// +------+------+------+------+
-//
-// In the end we do a horizontal reduction over these 4 vector accumulators to
-// get 4 values in the result vector.
-//
-// We have an inner epilogue loop to deal with the "B" submatrix and an outer
-// epilogue loop to deal with the C,D submatrix.
-class RowMajorMatrixVectorProductEmitter {
- public:
- RowMajorMatrixVectorProductEmitter(PrimitiveType scalar_type, int64 tile_rows,
- int64 tile_cols, int64 m, int64 k,
- llvm::Value* lhs, llvm::Value* rhs,
- llvm::Value* result,
- llvm::IRBuilder<>* ir_builder)
- : scalar_type_(scalar_type),
- tile_rows_(tile_rows),
- tile_cols_(tile_cols),
- m_(m),
- k_(k),
- lhs_(lhs),
- rhs_(rhs),
- result_(result),
- ir_builder_(ir_builder),
- ksl_(ir_builder_),
- vsl_(scalar_type_, /*vector_size=*/tile_cols_, ir_builder_, "") {
- CHECK(tile_cols_ > 0 && IsPowerOfTwo(static_cast<uint64>(tile_cols_)));
- }
-
- void Emit();
-
- private:
- TileLoader GetLhsTileLoader(llvm::Value* row_start, int64 row_count) {
- return TileLoader(&vsl_, ir_builder_, /*matrix=*/lhs_,
- /*matrix_size_along_minor_dim=*/k_,
- /*major_dim_offset=*/row_start,
- /*tile_size_along_major_dim=*/row_count);
- }
-
- void EmitOuterLoopBody(llvm::Value* row, int64 row_count);
-
- void EmitInnerLoopTiled(TileLoader* lhs_tile_loader, int64 rows,
- std::vector<VectorVariable>* vector_accumulators);
-
- void EmitInnerLoopEpilogue(llvm::Value* current_tile_row, int64 rows,
- std::vector<ScalarVariable>* scalar_accumulators);
-
- PrimitiveType scalar_type_;
- int64 tile_rows_;
- int64 tile_cols_;
- int64 m_;
- int64 k_;
- llvm::Value* lhs_;
- llvm::Value* rhs_;
- llvm::Value* result_;
- llvm::IRBuilder<>* ir_builder_;
- KernelSupportLibrary ksl_;
- VectorSupportLibrary vsl_;
-};
-
-void RowMajorMatrixVectorProductEmitter::EmitOuterLoopBody(llvm::Value* row,
- int64 row_count) {
- TileLoader lhs_tile_loader = GetLhsTileLoader(/*row_start=*/row,
- /*row_count=*/row_count);
- std::vector<VectorVariable> vector_accumulators;
- std::vector<ScalarVariable> scalar_accumulators;
- for (int i = 0; i < row_count; i++) {
- vector_accumulators.emplace_back(&vsl_, vsl_.GetZeroVector());
- scalar_accumulators.emplace_back(&vsl_, vsl_.GetZeroScalar());
- }
- EmitInnerLoopTiled(&lhs_tile_loader, /*rows=*/row_count,
- &vector_accumulators);
- EmitInnerLoopEpilogue(/*current_tile_row=*/row, /*rows=*/row_count,
- &scalar_accumulators);
-
- for (int i = 0; i < row_count; i++) {
- llvm::Value* result_value =
- vsl_.Add(vsl_.AddReduce(vector_accumulators[i].Get()),
- scalar_accumulators[i].Get());
- llvm::Value* offset = ir_builder_->CreateAdd(ir_builder_->getInt64(i), row);
- vsl_.StoreScalar(result_value, result_, offset);
- }
-}
-
-void RowMajorMatrixVectorProductEmitter::Emit() {
- // See the comment on the class declaration for the algorithm used here.
- int64 row_remainder = m_ % tile_rows_;
- int64 row_limit = m_ - row_remainder;
-
- ksl_.For("dot.outer.tiled",
- /*start=*/0, /*end=*/row_limit, /*step=*/tile_rows_,
- [&](llvm::Value* row) { EmitOuterLoopBody(row, tile_rows_); });
-
- if (row_remainder != 0) {
- EmitOuterLoopBody(ir_builder_->getInt64(row_limit), row_remainder);
- }
-}
-
-void RowMajorMatrixVectorProductEmitter::EmitInnerLoopTiled(
- TileLoader* lhs_tile_loader, int64 rows,
- std::vector<VectorVariable>* vector_accumulators) {
- int64 column_limit = k_ - (k_ % tile_cols_);
-
- ksl_.For("dot.inner.tiled", /*start=*/0, /*end=*/column_limit,
- /*step=*/tile_cols_, [&](llvm::Value* col) {
- std::vector<llvm::Value*> lhs_tile =
- lhs_tile_loader->LoadTile(/*minor_dim_offset=*/col);
- llvm::Value* rhs_value = vsl_.LoadVector(rhs_, col);
- for (int i = 0; i < rows; i++) {
- llvm::Value* old_sum = (*vector_accumulators)[i].Get();
- (*vector_accumulators)[i].Set(
- vsl_.Add(old_sum, vsl_.Mul(rhs_value, lhs_tile[i])));
- }
- });
-}
-
-void RowMajorMatrixVectorProductEmitter::EmitInnerLoopEpilogue(
- llvm::Value* current_tile_row, int64 rows,
- std::vector<ScalarVariable>* scalar_accumulators) {
- int64 column_start = k_ - (k_ % tile_cols_);
- if (column_start == k_) {
- return;
- }
-
- for (int r = 0; r < rows; r++) {
- llvm::Value* total_offset = ir_builder_->CreateMul(
- ir_builder_->CreateAdd(ir_builder_->getInt64(r), current_tile_row),
- ir_builder_->getInt64(k_));
- llvm::Value* lhs_base_pointer =
- vsl_.ComputeOffsetPointer(lhs_, total_offset);
- ksl_.For("dot.inner.epilg.inner", /*start=*/column_start, /*end=*/k_,
- /*step=*/1, [&](llvm::Value* scalar_col) {
- llvm::Value* product =
- vsl_.Mul(vsl_.LoadScalar(lhs_base_pointer, scalar_col),
- vsl_.LoadScalar(rhs_, scalar_col));
- llvm::Value* old_value = (*scalar_accumulators)[r].Get();
- (*scalar_accumulators)[r].Set(vsl_.Add(old_value, product));
- });
- }
-}
-
-} // namespace
-
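The matching scalar reference for the row-major emitter (again only the math, not the vectorized IR): each output element is one lhs row dotted with rhs, which the deleted code split across per-row vector accumulators, a horizontal add-reduce, and a scalar epilogue:

    #include <vector>

    void RowMajorGemv(const std::vector<float>& lhs,  // m * k, row major
                      const std::vector<float>& rhs,  // k entries
                      std::vector<float>* result,     // m entries
                      int m, int k) {
      for (int r = 0; r < m; ++r) {
        float acc = 0.0f;  // stands in for the vector + scalar accumulators
        for (int c = 0; c < k; ++c) {
          acc += lhs[r * k + c] * rhs[c];
        }
        (*result)[r] = acc;
      }
    }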
DotOpEmitter::DotOpEmitter(const HloInstruction& dot, bool transpose_lhs,
bool transpose_rhs,
const llvm_ir::IrArray& target_array,
@@ -518,93 +72,6 @@ DotOpEmitter::DotOpEmitter(const HloInstruction& dot, bool transpose_lhs,
bool DotOpEmitter::ShapesAreLegalForRuntimeDot() const { return true; }
-bool DotOpEmitter::EmitLlvmIrDotIfProfitable() {
- if (dot_.shape().dimensions_size() != 2 ||
- ProfitableToImplementDotInUntiledLlvmIr(dot_) ==
- DotInLlvmIrProfitable::kYes) {
- return false;
- }
-
- if (!primitive_util::IsFloatingPointType(dot_.shape().element_type()) &&
- !primitive_util::IsIntegralType(dot_.shape().element_type())) {
- return false;
- }
-
- MatMultDims mat_mult_dims = GetMatMultDims();
- bool is_column_major_matrix_vector = false;
- bool is_row_major_matrix_vector = false;
-
- int64 m, k;
- bool swap_operands;
-
- if (mat_mult_dims.m == 1) {
- bool rhs_effectively_row_major =
- transpose_rhs_ ^ !mat_mult_dims.rhs_column_major;
- if (rhs_effectively_row_major) {
- k = mat_mult_dims.k;
- m = mat_mult_dims.n;
- is_column_major_matrix_vector = true;
- swap_operands = true;
- } else {
- k = mat_mult_dims.k;
- m = mat_mult_dims.n;
- is_row_major_matrix_vector = true;
- swap_operands = true;
- }
- }
-
- if (mat_mult_dims.n == 1) {
- bool lhs_effectively_column_major =
- transpose_lhs_ ^ mat_mult_dims.lhs_column_major;
- if (lhs_effectively_column_major) {
- m = mat_mult_dims.m;
- k = mat_mult_dims.k;
- is_column_major_matrix_vector = true;
- swap_operands = false;
- } else {
- m = mat_mult_dims.m;
- k = mat_mult_dims.k;
- is_row_major_matrix_vector = true;
- swap_operands = false;
- }
- }
-
- if (!is_column_major_matrix_vector && !is_row_major_matrix_vector) {
- return false;
- }
-
- int64 tiling_factor = GetGemvTilingFactor();
- CHECK_GT(tiling_factor, 0);
-
- if (is_column_major_matrix_vector) {
- VLOG(2) << "Emitting column major matrix-vector multiply with m = " << m
- << " and k = " << k;
- ColumnMajorMatrixVectorProductEmitter emitter(
- dot_.shape().element_type(), /*tile_rows=*/8,
- /*tile_cols=*/tiling_factor, m, k,
- swap_operands ? rhs_array_.GetBasePointer()
- : lhs_array_.GetBasePointer(),
- swap_operands ? lhs_array_.GetBasePointer()
- : rhs_array_.GetBasePointer(),
- target_array_.GetBasePointer(), ir_builder_);
- emitter.Emit();
- } else {
- VLOG(2) << "Emitting row major matrix-vector multiply with m = " << m
- << " and k = " << k;
- RowMajorMatrixVectorProductEmitter emitter(
- dot_.shape().element_type(), /*tile_rows=*/tiling_factor,
- /*tile_cols=*/8, m, k,
- swap_operands ? rhs_array_.GetBasePointer()
- : lhs_array_.GetBasePointer(),
- swap_operands ? lhs_array_.GetBasePointer()
- : rhs_array_.GetBasePointer(),
- target_array_.GetBasePointer(), ir_builder_);
- emitter.Emit();
- }
-
- return true;
-}
-
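One detail of the removed dispatch worth spelling out: the effective layout is the stored layout XOR-ed with the transpose flag, and that choice picked the emitter. A sketch (names local to this example):

    // A transposed row-major lhs behaves like a column-major one and vice
    // versa; the removed code applied the same trick to the rhs when m == 1.
    bool EffectivelyColumnMajor(bool transpose_lhs, bool lhs_column_major) {
      return transpose_lhs ^ lhs_column_major;
    }
    // EffectivelyColumnMajor(true, false) == true   -> column-major emitter
    // EffectivelyColumnMajor(true, true)  == false  -> row-major emitter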
tensorflow::Status DotOpEmitter::Emit() {
// The dot operation performs a sum of products over dimension 0 of the left
// hand side operand and dimension 1 of the right hand side operand.
@@ -638,10 +105,6 @@ tensorflow::Status DotOpEmitter::Emit() {
return EmitScalarDot();
}
- if (EmitLlvmIrDotIfProfitable()) {
- return Status::OK();
- }
-
if (PotentiallyImplementedAsEigenDot(dot_)) {
return EmitCallToRuntime();
}
@@ -877,17 +340,22 @@ tensorflow::Status DotOpEmitter::EmitCallToRuntime() {
//
// Effectively this involves swapping the 'lhs' with 'rhs' and 'm' with 'n'.
- MatMultDims mat_mult_dims = GetMatMultDims();
+ const Shape& lhs_shape = lhs_array_.GetShape();
+ const Shape& rhs_shape = rhs_array_.GetShape();
- CHECK_EQ(mat_mult_dims.lhs_column_major, mat_mult_dims.rhs_column_major);
+ CHECK(LayoutUtil::Equal(lhs_shape.layout(), rhs_shape.layout()));
+ int64 m = lhs_shape.dimensions(transpose_lhs_ ? 1 : 0);
+ int64 k = lhs_shape.dimensions(transpose_lhs_ ? 0 : 1);
+ int64 n = rhs_shape.dimensions(transpose_rhs_ ? 0 : 1);
const llvm_ir::IrArray* lhs = &lhs_array_;
const llvm_ir::IrArray* rhs = &rhs_array_;
bool transpose_lhs = transpose_lhs_;
bool transpose_rhs = transpose_rhs_;
- if (!mat_mult_dims.lhs_column_major) {
- std::swap(mat_mult_dims.m, mat_mult_dims.n);
+ bool is_column_major = lhs_shape.layout().minor_to_major(0) == 0;
+ if (!is_column_major) {
+ std::swap(m, n);
std::swap(lhs, rhs);
std::swap(transpose_lhs, transpose_rhs);
}
@@ -899,27 +367,12 @@ tensorflow::Status DotOpEmitter::EmitCallToRuntime() {
float_ptr_type),
ir_builder_->CreateBitCast(lhs->GetBasePointer(), float_ptr_type),
ir_builder_->CreateBitCast(rhs->GetBasePointer(), float_ptr_type),
- ir_builder_->getInt64(mat_mult_dims.m),
- ir_builder_->getInt64(mat_mult_dims.n),
- ir_builder_->getInt64(mat_mult_dims.k),
- ir_builder_->getInt32(transpose_lhs),
+ ir_builder_->getInt64(m), ir_builder_->getInt64(n),
+ ir_builder_->getInt64(k), ir_builder_->getInt32(transpose_lhs),
ir_builder_->getInt32(transpose_rhs)});
return tensorflow::Status::OK();
}
-DotOpEmitter::MatMultDims DotOpEmitter::GetMatMultDims() const {
- CHECK_EQ(dot_.shape().dimensions_size(), 2);
-
- const Shape& lhs_shape = lhs_array_.GetShape();
- const Shape& rhs_shape = rhs_array_.GetShape();
-
- return {lhs_shape.dimensions(transpose_lhs_ ? 1 : 0),
- lhs_shape.dimensions(transpose_lhs_ ? 0 : 1),
- rhs_shape.dimensions(transpose_rhs_ ? 0 : 1),
- lhs_shape.layout().minor_to_major(0) == 0,
- rhs_shape.layout().minor_to_major(0) == 0};
-}
-
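A worked instance of the removed helper, with illustrative shapes (the numbers are this example's assumption only): for transpose_lhs = true, lhs shape [16, 8], rhs shape [16, 4], and transpose_rhs = false:

    // m = lhs.dimensions(1) = 8   // rows of the transposed LHS
    // k = lhs.dimensions(0) = 16  // contracted dimension
    // n = rhs.dimensions(1) = 4   // columns of the RHS
    // lhs_column_major / rhs_column_major just test whether dimension 0 is
    // minor, i.e. shape.layout().minor_to_major(0) == 0.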
llvm_ir::IrArray::Index DotOpEmitter::EmitOperandArrayLoopNest(
llvm_ir::ForLoopNest* loop_nest, const llvm_ir::IrArray& operand_array,
int64 reduction_dimension, tensorflow::StringPiece name_suffix) {
diff --git a/tensorflow/compiler/xla/service/cpu/dot_op_emitter.h b/tensorflow/compiler/xla/service/cpu/dot_op_emitter.h
index 470bf6ffb4..cfc1066045 100644
--- a/tensorflow/compiler/xla/service/cpu/dot_op_emitter.h
+++ b/tensorflow/compiler/xla/service/cpu/dot_op_emitter.h
@@ -17,7 +17,6 @@ limitations under the License.
#define TENSORFLOW_COMPILER_XLA_SERVICE_CPU_DOT_OP_EMITTER_H_
#include "llvm/IR/IRBuilder.h"
-#include "tensorflow/compiler/xla/service/cpu/cpu_options.h"
#include "tensorflow/compiler/xla/service/hlo_instruction.h"
#include "tensorflow/compiler/xla/service/hlo_module_config.h"
#include "tensorflow/compiler/xla/service/llvm_ir/ir_array.h"
@@ -60,10 +59,6 @@ class DotOpEmitter {
// LHS and RHS) and store the results in the target.
tensorflow::Status EmitScalarDot();
- // Emit an LLVM IR implementation of the dot operation if we can. Returns
- // true if an LLVM IR implementation was emitted.
- bool EmitLlvmIrDotIfProfitable();
-
// Emits a call to the CPU runtime to perform the matrix multiply.
tensorflow::Status EmitCallToRuntime();
@@ -82,38 +77,6 @@ class DotOpEmitter {
// no padding, and a rank of two.
bool ShapesAreLegalForRuntimeDot() const;
- // Represents the dimensions of a matrix-matrix multiply operation.
- struct MatMultDims {
- // The number of rows in the LHS.
- int64 m;
-
- // The number of columns in the LHS, which must also be equal to the
- // number of rows in the RHS.
- int64 k;
-
- // The number of columns on the RHS.
- int64 n;
-
- // True if the LHS matrix is column major.
- bool lhs_column_major;
-
- // True if the RHS matrix is column major.
- bool rhs_column_major;
- };
-
- // Get the MatMultDims instance for the dot product this DotOpEmitter
- // represents. Precondition: the dot is of rank 2 (and thus its operands are
- // of rank 2 as well).
- MatMultDims GetMatMultDims() const;
-
- // When doing a tiled GEMV in LLVM IR, a "tile" consists of this many vector
- // registers.
- int64 GetGemvTilingFactor() const {
- const int64 kDefaultTilingFactor = 8;
- return options::LlvmIrGemvTilingFactor(hlo_module_config_)
- .value_or(kDefaultTilingFactor);
- }
-
const HloInstruction& dot_;
const bool transpose_lhs_;
const bool transpose_rhs_;
diff --git a/tensorflow/compiler/xla/service/cpu/ir_emission_utils.cc b/tensorflow/compiler/xla/service/cpu/ir_emission_utils.cc
index 7149a19310..b99b36a55e 100644
--- a/tensorflow/compiler/xla/service/cpu/ir_emission_utils.cc
+++ b/tensorflow/compiler/xla/service/cpu/ir_emission_utils.cc
@@ -105,9 +105,7 @@ bool PotentiallyImplementedAsEigenDot(const HloInstruction& hlo) {
return false;
}
- if (ProfitableToImplementDotInUntiledLlvmIr(hlo) ==
- DotInLlvmIrProfitable::kYes ||
- ProfitableToImplementDotInTiledLlvmIr(hlo)) {
+ if (ProfitableToImplementDotInLlvmIr(hlo) == DotInLlvmIrProfitable::kYes) {
return false;
}
@@ -138,7 +136,7 @@ bool PotentiallyImplementedAsEigenDot(const HloInstruction& hlo) {
return false;
}
-DotInLlvmIrProfitable ProfitableToImplementDotInUntiledLlvmIr(
+DotInLlvmIrProfitable ProfitableToImplementDotInLlvmIr(
const HloInstruction& dot) {
if (dot.opcode() == HloOpcode::kDot && dot.shape().dimensions_size() == 2) {
const Shape& result_shape = dot.shape();
@@ -180,16 +178,5 @@ DotInLlvmIrProfitable ProfitableToImplementDotInUntiledLlvmIr(
return DotInLlvmIrProfitable::kNo;
}
-bool ProfitableToImplementDotInTiledLlvmIr(const HloInstruction& dot) {
- // Any matrix-vector product of floating point or integral type, or a
- // transpose-dot fusion of the same, can be lowered to a tiled LLVM IR
- // implementation.
- const Shape& shape = dot.shape();
- return shape.dimensions_size() == 2 &&
- (shape.dimensions(0) == 1 || shape.dimensions(1) == 1) &&
- (primitive_util::IsFloatingPointType(shape.element_type()) ||
- primitive_util::IsIntegralType(shape.element_type()));
-}
-
} // namespace cpu
} // namespace xla
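For reference, the shape test in the removed ProfitableToImplementDotInTiledLlvmIr boils down to "rank 2 with one unit output dimension". A sketch with simplified shape handling:

    #include <cstdint>
    #include <vector>

    bool IsMatrixVectorShape(const std::vector<int64_t>& dims) {
      return dims.size() == 2 && (dims[0] == 1 || dims[1] == 1);
    }
    // {1, 512} and {512, 1} qualified, steering such dots away from Eigen;
    // {64, 64} fell through to the Eigen path.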
diff --git a/tensorflow/compiler/xla/service/cpu/ir_emission_utils.h b/tensorflow/compiler/xla/service/cpu/ir_emission_utils.h
index cbe07a7c2b..66656ed997 100644
--- a/tensorflow/compiler/xla/service/cpu/ir_emission_utils.h
+++ b/tensorflow/compiler/xla/service/cpu/ir_emission_utils.h
@@ -29,21 +29,16 @@ bool PotentiallyImplementedAsEigenDot(const HloInstruction& dot);
enum class DotInLlvmIrProfitable { kYes, kNo, kWithColumnMajorRhs };
// Returns a value to indicate whether (and under what conditions) lowering
-// |dot| as an untiled LLVM IR dot operation will be profitable over calling
-// into Eigen or emitting a tiled LLVM IR implementation. Possible return
-// values are:
+// |dot| as a pure LLVM IR dot operation will be profitable over calling into
+// Eigen. Possible return values are:
//
// * DotInLlvmIrProfitable::kYes - always profitable.
// * DotInLlvmIrProfitable::kNo - never profitable.
// * DotInLlvmIrProfitable::kWithColumnMajorRhs - only if we can manage to make
// the Rhs layout column major.
-DotInLlvmIrProfitable ProfitableToImplementDotInUntiledLlvmIr(
+DotInLlvmIrProfitable ProfitableToImplementDotInLlvmIr(
const HloInstruction& dot);
-// Returns true to indicate that we can generate a tiled LLVM IR implementation
-// for |dot|.
-bool ProfitableToImplementDotInTiledLlvmIr(const HloInstruction& dot);
-
} // namespace cpu
} // namespace xla
diff --git a/tensorflow/compiler/xla/service/cpu/ir_emitter.cc b/tensorflow/compiler/xla/service/cpu/ir_emitter.cc
index e547f291b8..a20ce6826c 100644
--- a/tensorflow/compiler/xla/service/cpu/ir_emitter.cc
+++ b/tensorflow/compiler/xla/service/cpu/ir_emitter.cc
@@ -1983,11 +1983,6 @@ Status IrEmitter::HandleSend(HloInstruction* send) {
return Unimplemented("Send is not implemented on CPU. See b/33942983.");
}
-Status IrEmitter::HandleSendDone(HloInstruction* send_done) {
- // TODO(b/33942983): Support Send/Recv on CPU.
- return Unimplemented("Send-done is not implemented on CPU. See b/33942983.");
-}
-
Status IrEmitter::HandleSlice(HloInstruction* slice) {
VLOG(2) << "HandleSlice: " << slice->ToString();
auto operand = slice->operand(0);
@@ -2153,11 +2148,6 @@ Status IrEmitter::HandleRecv(HloInstruction* recv) {
return Unimplemented("Recv is not implemented on CPU. See b/33942983.");
}
-Status IrEmitter::HandleRecvDone(HloInstruction* recv_done) {
- // TODO(b/33942983): Support Send/Recv on CPU.
- return Unimplemented("Recv-done is not implemented on CPU. See b/33942983.");
-}
-
Status IrEmitter::HandlePad(HloInstruction* pad) {
// CPU backend does not properly handle negative padding but this is ok
// because negative padding should be removed by the algebraic simplifier.
diff --git a/tensorflow/compiler/xla/service/cpu/ir_emitter.h b/tensorflow/compiler/xla/service/cpu/ir_emitter.h
index 83eded5ad8..5d061e11e3 100644
--- a/tensorflow/compiler/xla/service/cpu/ir_emitter.h
+++ b/tensorflow/compiler/xla/service/cpu/ir_emitter.h
@@ -171,13 +171,11 @@ class IrEmitter : public DfsHloVisitorWithDefault {
Status HandleReduceWindow(HloInstruction* reduce_window) override;
Status HandleSelectAndScatter(HloInstruction* select_and_scatter) override;
Status HandleSend(HloInstruction* send) override;
- Status HandleSendDone(HloInstruction* send_done) override;
Status HandleSlice(HloInstruction* slice) override;
Status HandleDynamicSlice(HloInstruction* dynamic_slice) override;
Status HandleDynamicUpdateSlice(
HloInstruction* dynamic_update_slice) override;
Status HandleRecv(HloInstruction* recv) override;
- Status HandleRecvDone(HloInstruction* recv_done) override;
Status HandlePad(HloInstruction* pad) override;
Status HandleTuple(HloInstruction* tuple) override;
Status HandleMap(HloInstruction* map) override;
diff --git a/tensorflow/compiler/xla/service/cpu/layout_assignment.cc b/tensorflow/compiler/xla/service/cpu/layout_assignment.cc
index b75ca34e0a..c446b6b792 100644
--- a/tensorflow/compiler/xla/service/cpu/layout_assignment.cc
+++ b/tensorflow/compiler/xla/service/cpu/layout_assignment.cc
@@ -51,7 +51,7 @@ Status CpuLayoutAssignment::AddBackendConstraints(
tensorflow::gtl::FlatMap<const HloInstruction*, bool>
should_make_rhs_col_major_cache;
auto should_make_rhs_col_major = [&](const HloInstruction& instruction) {
- if (ProfitableToImplementDotInUntiledLlvmIr(instruction) !=
+ if (ProfitableToImplementDotInLlvmIr(instruction) !=
DotInLlvmIrProfitable::kWithColumnMajorRhs) {
return false;
}
@@ -68,7 +68,7 @@ Status CpuLayoutAssignment::AddBackendConstraints(
bool result = std::all_of(
rhs->users().begin(), rhs->users().end(), [&](HloInstruction* user) {
- return ProfitableToImplementDotInUntiledLlvmIr(*user) ==
+ return ProfitableToImplementDotInLlvmIr(*user) ==
DotInLlvmIrProfitable::kWithColumnMajorRhs &&
user->operand(0) != rhs;
});
diff --git a/tensorflow/compiler/xla/service/dfs_hlo_visitor.h b/tensorflow/compiler/xla/service/dfs_hlo_visitor.h
index bc73839a88..de3cd15440 100644
--- a/tensorflow/compiler/xla/service/dfs_hlo_visitor.h
+++ b/tensorflow/compiler/xla/service/dfs_hlo_visitor.h
@@ -211,11 +211,9 @@ class DfsHloVisitorBase {
virtual Status HandlePad(HloInstructionPtr hlo) = 0;
- virtual Status HandleSend(HloInstructionPtr send) = 0;
- virtual Status HandleSendDone(HloInstructionPtr send_done) = 0;
+ virtual Status HandleSend(HloInstructionPtr hlo) = 0;
- virtual Status HandleRecv(HloInstructionPtr recv) = 0;
- virtual Status HandleRecvDone(HloInstructionPtr recv_done) = 0;
+ virtual Status HandleRecv(HloInstructionPtr hlo) = 0;
virtual Status HandleBatchNormTraining(HloInstructionPtr hlo) = 0;
diff --git a/tensorflow/compiler/xla/service/dfs_hlo_visitor_with_default.h b/tensorflow/compiler/xla/service/dfs_hlo_visitor_with_default.h
index 5415bab5b3..7ce88be89d 100644
--- a/tensorflow/compiler/xla/service/dfs_hlo_visitor_with_default.h
+++ b/tensorflow/compiler/xla/service/dfs_hlo_visitor_with_default.h
@@ -167,17 +167,11 @@ class DfsHloVisitorWithDefaultBase
Status HandleWhile(HloInstructionPtr xla_while) override {
return DefaultAction(xla_while);
}
- Status HandleRecv(HloInstructionPtr recv) override {
- return DefaultAction(recv);
- }
- Status HandleRecvDone(HloInstructionPtr recv_done) override {
- return DefaultAction(recv_done);
- }
Status HandleSend(HloInstructionPtr send) override {
return DefaultAction(send);
}
- Status HandleSendDone(HloInstructionPtr send_done) override {
- return DefaultAction(send_done);
+ Status HandleRecv(HloInstructionPtr recv) override {
+ return DefaultAction(recv);
}
// Invoked to inform the visitor that the traversal has completed, and that
diff --git a/tensorflow/compiler/xla/service/gpu/convolution_thunk.cc b/tensorflow/compiler/xla/service/gpu/convolution_thunk.cc
index e79d0a4c79..536b96dcf6 100644
--- a/tensorflow/compiler/xla/service/gpu/convolution_thunk.cc
+++ b/tensorflow/compiler/xla/service/gpu/convolution_thunk.cc
@@ -19,7 +19,6 @@ limitations under the License.
#include "tensorflow/compiler/xla/types.h"
#include "tensorflow/compiler/xla/util.h"
-#include "tensorflow/core/lib/strings/strcat.h"
#include "tensorflow/core/lib/strings/stringprintf.h"
#include "tensorflow/core/platform/logging.h"
#include "tensorflow/core/platform/stream_executor_no_cuda.h"
@@ -280,13 +279,6 @@ std::vector<AlgorithmDesc> ConvolutionThunk::GetAlgorithms(
return algorithms;
}
-static string AlgorithmToString(const se::dnn::AlgorithmDesc& algo) {
- if (algo.tensor_ops_enabled()) {
- return tensorflow::strings::StrCat(algo.algo_id(), "+TC");
- }
- return tensorflow::strings::StrCat(algo.algo_id());
-}
-
tensorflow::Status ConvolutionThunk::ConvolveWithTune(
const BatchDescriptor& input_descriptor, se::DeviceMemory<float> input_data,
const FilterDescriptor& filter_descriptor,
@@ -311,8 +303,6 @@ tensorflow::Status ConvolutionThunk::ConvolveWithTune(
buffer_allocations.device_ordinal(),
buffer_allocations.memory_allocator());
se::dnn::ProfileResult profile_result;
- VLOG(3) << "Trying algorithm " << AlgorithmToString(algorithm)
- << " for ConvolutionThunk: " << this;
bool launch_ok =
Convolve(input_descriptor, input_data, filter_descriptor, filter_data,
output_descriptor, output_data, convolution_descriptor,
@@ -320,11 +310,6 @@ tensorflow::Status ConvolutionThunk::ConvolveWithTune(
&scratch_allocator, &profile_result)
.ok();
if (launch_ok && profile_result.is_valid()) {
- VLOG(3) << "Run of algorithm " << AlgorithmToString(algorithm)
- << " for ConvolutionThunk " << this << " succeeded, taking "
- << profile_result.elapsed_time_in_ms()
- << "ms. (Best result: " << best_result.elapsed_time_in_ms()
- << "ms)";
if (profile_result.elapsed_time_in_ms() <
best_result.elapsed_time_in_ms()) {
best_result = profile_result;
@@ -334,9 +319,6 @@ tensorflow::Status ConvolutionThunk::ConvolveWithTune(
best_result_without_scratch.elapsed_time_in_ms()) {
best_result_without_scratch = profile_result;
}
- } else {
- VLOG(3) << "Run of algorithm " << AlgorithmToString(algorithm)
- << " for ConvolutionThunk " << this << " failed.";
}
}
@@ -361,8 +343,8 @@ tensorflow::Status ConvolutionThunk::ConvolveWithTune(
{
VLOG(2) << "Using convolution algorithm ("
- << AlgorithmToString(best_algorithm_.algorithm()) << ", "
- << AlgorithmToString(best_algorithm_.algorithm_no_scratch())
+ << best_algorithm_.algorithm().algo_id() << ", "
+ << best_algorithm_.algorithm_no_scratch().algo_id()
<< ") for ConvolutionThunk: " << this;
ConvolveScratchAllocator scratch_allocator(
buffer_allocations.device_ordinal(),
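The removed logging helper formatted an algorithm id with a "+TC" suffix when tensor ops were enabled; an equivalent sketch with std::to_string in place of strings::StrCat:

    #include <string>

    std::string AlgorithmToString(long long algo_id, bool tensor_ops_enabled) {
      std::string s = std::to_string(algo_id);
      if (tensor_ops_enabled) s += "+TC";
      return s;
    }
    // AlgorithmToString(7, true) == "7+TC"; AlgorithmToString(7, false) == "7".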
diff --git a/tensorflow/compiler/xla/service/gpu/gpu_compiler.cc b/tensorflow/compiler/xla/service/gpu/gpu_compiler.cc
index 187b4a705c..ceb0e530c1 100644
--- a/tensorflow/compiler/xla/service/gpu/gpu_compiler.cc
+++ b/tensorflow/compiler/xla/service/gpu/gpu_compiler.cc
@@ -75,7 +75,6 @@ limitations under the License.
#include "tensorflow/core/platform/logging.h"
#include "tensorflow/core/platform/stream_executor_no_cuda.h"
#include "tensorflow/core/platform/subprocess.h"
-#include "tensorflow/core/platform/tracing.h"
namespace se = ::perftools::gputools;
@@ -88,7 +87,6 @@ namespace gpu {
namespace {
-using tensorflow::port::Tracing;
using tensorflow::strings::StrCat;
// Any address of a variable residing in global memory or returned by one of the
@@ -233,7 +231,6 @@ tensorflow::Status PrepareHloModuleForIrEmitting(
// code (i.e. a cubin) as a byte array.
StatusOr<std::vector<uint8>> CompilePtx(const string& ptx, int cc_major,
int cc_minor) {
- Tracing::TraceMe annotation("Compile PTX", /*is_expensive=*/true);
const string ptxas_path =
tensorflow::io::JoinPath(tensorflow::CudaRoot(), "bin", "ptxas");
VLOG(2) << "Using ptxas at " << ptxas_path;
@@ -298,15 +295,11 @@ StatusOr<std::unique_ptr<Executable>> GpuCompiler::Compile(
std::unique_ptr<HloModule> module, se::StreamExecutor* stream_exec) {
TF_RET_CHECK(stream_exec != nullptr);
- {
- Tracing::TraceMe annotation("HLO Transforms", module->name(),
- /*is_expensive=*/true);
- TF_RETURN_IF_ERROR(OptimizeHloModule(module.get(),
- stream_exec->GetDeviceDescription(),
- ShapeSizeBytesFunction()));
- TF_RETURN_IF_ERROR(
- PrepareHloModuleForIrEmitting(module.get(), ShapeSizeBytesFunction()));
- }
+ TF_RETURN_IF_ERROR(OptimizeHloModule(module.get(),
+ stream_exec->GetDeviceDescription(),
+ ShapeSizeBytesFunction()));
+ TF_RETURN_IF_ERROR(
+ PrepareHloModuleForIrEmitting(module.get(), ShapeSizeBytesFunction()));
llvm::LLVMContext llvm_context;
std::string buffer;
@@ -451,7 +444,6 @@ StatusOr<std::unique_ptr<Executable>> GpuCompiler::Compile(
std::vector<uint8> GpuCompiler::CompilePtxOrGetCachedResult(const string& ptx,
int cc_major,
int cc_minor) {
- Tracing::TraceMe annotation("PTX->CUBIN", /*is_expensive=*/true);
bool inserted;
decltype(compilation_cache_.begin()) iter;
// Pointers into compilation_cache_ where the ptx and (optional) cubin are
diff --git a/tensorflow/compiler/xla/service/gpu/ir_emitter.cc b/tensorflow/compiler/xla/service/gpu/ir_emitter.cc
index 9d55c7859d..57a3f713e3 100644
--- a/tensorflow/compiler/xla/service/gpu/ir_emitter.cc
+++ b/tensorflow/compiler/xla/service/gpu/ir_emitter.cc
@@ -128,18 +128,10 @@ Status IrEmitter::HandleSend(HloInstruction*) {
return Unimplemented("Send is not implemented on GPU");
}
-Status IrEmitter::HandleSendDone(HloInstruction*) {
- return Unimplemented("Send-Done is not implemented on GPU");
-}
-
Status IrEmitter::HandleRecv(HloInstruction*) {
return Unimplemented("Recv is not implemented on GPU");
}
-Status IrEmitter::HandleRecvDone(HloInstruction*) {
- return Unimplemented("Recv-done is not implemented on GPU");
-}
-
Status IrEmitter::HandleTuple(HloInstruction* tuple) {
std::vector<llvm::Value*> base_ptrs;
for (const HloInstruction* operand : tuple->operands()) {
diff --git a/tensorflow/compiler/xla/service/gpu/ir_emitter.h b/tensorflow/compiler/xla/service/gpu/ir_emitter.h
index 61fdeaa0ee..263992d925 100644
--- a/tensorflow/compiler/xla/service/gpu/ir_emitter.h
+++ b/tensorflow/compiler/xla/service/gpu/ir_emitter.h
@@ -84,9 +84,7 @@ class IrEmitter : public DfsHloVisitorWithDefault {
Status HandleOutfeed(HloInstruction* outfeed) override;
Status HandleSort(HloInstruction* sort) override;
Status HandleSend(HloInstruction* send) override;
- Status HandleSendDone(HloInstruction* send_done) override;
Status HandleRecv(HloInstruction* recv) override;
- Status HandleRecvDone(HloInstruction* recv_done) override;
Status HandleParameter(HloInstruction* parameter) override;
Status HandleReduce(HloInstruction* reduce) override;
Status HandleTuple(HloInstruction* tuple) override;
diff --git a/tensorflow/compiler/xla/service/gpu/llvm_gpu_backend/gpu_backend_lib.cc b/tensorflow/compiler/xla/service/gpu/llvm_gpu_backend/gpu_backend_lib.cc
index 1cb963be61..817e95a31c 100644
--- a/tensorflow/compiler/xla/service/gpu/llvm_gpu_backend/gpu_backend_lib.cc
+++ b/tensorflow/compiler/xla/service/gpu/llvm_gpu_backend/gpu_backend_lib.cc
@@ -60,7 +60,6 @@ limitations under the License.
#include "tensorflow/core/lib/strings/stringprintf.h"
#include "tensorflow/core/platform/env.h"
#include "tensorflow/core/platform/logging.h"
-#include "tensorflow/core/platform/tracing.h"
namespace xla {
namespace gpu {
@@ -489,9 +488,6 @@ StatusOr<string> CompileToPtx(llvm::Module* module,
string ptx;
{
- tensorflow::port::Tracing::TraceMe annotation(
- "Compiling IR", llvm_ir::AsString(module->getName()),
- /*is_expensive=*/true);
ScopedLoggingTimer compilation_timer(
"Compile module " + llvm_ir::AsString(module->getName()),
/*vlog_level=*/2);
diff --git a/tensorflow/compiler/xla/service/hlo_cost_analysis.cc b/tensorflow/compiler/xla/service/hlo_cost_analysis.cc
index 1877065f67..17ba2b673a 100644
--- a/tensorflow/compiler/xla/service/hlo_cost_analysis.cc
+++ b/tensorflow/compiler/xla/service/hlo_cost_analysis.cc
@@ -337,18 +337,10 @@ Status HloCostAnalysis::HandleSend(const HloInstruction*) {
return Status::OK();
}
-Status HloCostAnalysis::HandleSendDone(const HloInstruction*) {
- return Status::OK();
-}
-
Status HloCostAnalysis::HandleRecv(const HloInstruction*) {
return Status::OK();
}
-Status HloCostAnalysis::HandleRecvDone(const HloInstruction*) {
- return Status::OK();
-}
-
Status HloCostAnalysis::HandleReshape(const HloInstruction*) {
return Status::OK();
}
diff --git a/tensorflow/compiler/xla/service/hlo_cost_analysis.h b/tensorflow/compiler/xla/service/hlo_cost_analysis.h
index 0f44775378..8074868e37 100644
--- a/tensorflow/compiler/xla/service/hlo_cost_analysis.h
+++ b/tensorflow/compiler/xla/service/hlo_cost_analysis.h
@@ -60,9 +60,7 @@ class HloCostAnalysis : public ConstDfsHloVisitor {
Status HandleReducePrecision(const HloInstruction* hlo) override;
Status HandleConcatenate(const HloInstruction* concatenate) override;
Status HandleSend(const HloInstruction* send) override;
- Status HandleSendDone(const HloInstruction* send_done) override;
Status HandleRecv(const HloInstruction* recv) override;
- Status HandleRecvDone(const HloInstruction* recv_done) override;
Status HandleConvert(const HloInstruction* convert) override;
Status HandleCopy(const HloInstruction* copy) override;
Status HandleDot(const HloInstruction* dot) override;
diff --git a/tensorflow/compiler/xla/service/hlo_cse_test.cc b/tensorflow/compiler/xla/service/hlo_cse_test.cc
index 3601a790c4..7c4626e78a 100644
--- a/tensorflow/compiler/xla/service/hlo_cse_test.cc
+++ b/tensorflow/compiler/xla/service/hlo_cse_test.cc
@@ -79,12 +79,12 @@ TEST_F(HloCseTest, CombineTwoConstantsDifferentLayoutsAndInsensitive) {
// Test that two identical constants with different layouts are commoned if
// the pass is not layout sensitive.
auto builder = HloComputation::Builder(TestName());
- auto constant1 = builder.AddInstruction(
- HloInstruction::CreateConstant(Literal::CreateR2WithLayout<float>(
- {{1.0, 2.0}, {3.0, 4.0}}, LayoutUtil::MakeLayout({0, 1}))));
- auto constant2 = builder.AddInstruction(
- HloInstruction::CreateConstant(Literal::CreateR2WithLayout<float>(
- {{1.0, 2.0}, {3.0, 4.0}}, LayoutUtil::MakeLayout({1, 0}))));
+ auto constant1 = builder.AddInstruction(HloInstruction::CreateConstant(
+ test_utils::CreateR2LiteralWithLayout<float>({{1.0, 2.0}, {3.0, 4.0}},
+ /*minor_to_major=*/{0, 1})));
+ auto constant2 = builder.AddInstruction(HloInstruction::CreateConstant(
+ test_utils::CreateR2LiteralWithLayout<float>({{1.0, 2.0}, {3.0, 4.0}},
+ /*minor_to_major=*/{1, 0})));
auto add = builder.AddInstruction(HloInstruction::CreateBinary(
constant1->shape(), HloOpcode::kAdd, constant1, constant2));
@@ -111,12 +111,12 @@ TEST_F(HloCseTest, CombineTwoConstantsDifferentLayoutsAndSensitive) {
// Test that two identical constants with different layouts are *not* commoned
// if the pass is layout sensitive.
auto builder = HloComputation::Builder(TestName());
- auto constant1 = builder.AddInstruction(
- HloInstruction::CreateConstant(Literal::CreateR2WithLayout<float>(
- {{1.0, 2.0}, {3.0, 4.0}}, LayoutUtil::MakeLayout({0, 1}))));
- auto constant2 = builder.AddInstruction(
- HloInstruction::CreateConstant(Literal::CreateR2WithLayout<float>(
- {{1.0, 2.0}, {3.0, 4.0}}, LayoutUtil::MakeLayout({1, 0}))));
+ auto constant1 = builder.AddInstruction(HloInstruction::CreateConstant(
+ test_utils::CreateR2LiteralWithLayout<float>({{1.0, 2.0}, {3.0, 4.0}},
+ /*minor_to_major=*/{0, 1})));
+ auto constant2 = builder.AddInstruction(HloInstruction::CreateConstant(
+ test_utils::CreateR2LiteralWithLayout<float>({{1.0, 2.0}, {3.0, 4.0}},
+ /*minor_to_major=*/{1, 0})));
auto add = builder.AddInstruction(HloInstruction::CreateBinary(
constant1->shape(), HloOpcode::kAdd, constant1, constant2));
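On the layouts in these tests: minor_to_major {1, 0} makes dimension 1 fastest varying (row major for a 2D array), while {0, 1} makes dimension 0 fastest varying (column major). The constants hold identical values, so only the layout-sensitive variant of the pass must keep them distinct. The linear indexing for a 2x2 array, as a sketch:

    #include <cstdint>

    int64_t IndexRowMajor(int64_t r, int64_t c) { return r * 2 + c; }  // {1, 0}
    int64_t IndexColMajor(int64_t r, int64_t c) { return c * 2 + r; }  // {0, 1}
    // {{1.0, 2.0}, {3.0, 4.0}} is stored as 1,2,3,4 under {1, 0} and as
    // 1,3,2,4 under {0, 1}.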
diff --git a/tensorflow/compiler/xla/service/hlo_dataflow_analysis.cc b/tensorflow/compiler/xla/service/hlo_dataflow_analysis.cc
index ff80f18bb5..92261bce62 100644
--- a/tensorflow/compiler/xla/service/hlo_dataflow_analysis.cc
+++ b/tensorflow/compiler/xla/service/hlo_dataflow_analysis.cc
@@ -242,51 +242,6 @@ bool HloDataflowAnalysis::UpdateBitcastValueSet(HloInstruction* bitcast) {
return false;
}
-bool HloDataflowAnalysis::UpdateSendValueSet(HloInstruction* send) {
- CHECK_EQ(send->opcode(), HloOpcode::kSend);
- bool changed = false;
- // Send forwards the operand value to the output tuple at {0}.
- for (auto& pair : GetInstructionValueSet(send->operand(0))) {
- const ShapeIndex& operand_index = pair.first;
- const HloValueSet& operand_value_set = pair.second;
-
- ShapeIndex index = {0};
- for (int64 i : operand_index) {
- index.push_back(i);
- }
-
- HloValueSet& value_set = GetValueSet(send, index);
- if (value_set != operand_value_set) {
- value_set = operand_value_set;
- changed = true;
- }
- }
- return changed;
-}
-
-bool HloDataflowAnalysis::UpdateRecvDoneValueSet(HloInstruction* recv_done) {
- CHECK_EQ(recv_done->opcode(), HloOpcode::kRecvDone);
- bool changed = false;
- // RecvDone forwards the operand value at {0} to the output.
- for (auto& pair : GetInstructionValueSet(recv_done)) {
- ShapeIndex& index = pair.first;
- HloValueSet& value_set = pair.second;
-
- ShapeIndex operand_index = {0};
- for (int64 i : index) {
- operand_index.push_back(i);
- }
-
- const HloValueSet& operand_value_set =
- GetValueSet(recv_done->operand(0), operand_index);
- if (value_set != operand_value_set) {
- value_set = operand_value_set;
- changed = true;
- }
- }
- return changed;
-}
-
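The index arithmetic in the two deleted updaters is the crux: Send maps an operand index into tuple element {0} of its output, and RecvDone strips that prefix back off. A sketch with ShapeIndex modeled as a plain vector:

    #include <cstdint>
    #include <vector>

    using ShapeIndex = std::vector<int64_t>;

    // Send: the operand value at index I aliases output index {0} + I.
    ShapeIndex SendOutputIndex(const ShapeIndex& operand_index) {
      ShapeIndex index = {0};
      index.insert(index.end(), operand_index.begin(), operand_index.end());
      return index;
    }
    // RecvDone inverts this: its output value at index I comes from operand
    // index {0} + I. SendOutputIndex({1, 2}) == {0, 1, 2}.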
bool HloDataflowAnalysis::UpdateCallValueSet(HloInstruction* call) {
CHECK_EQ(call->opcode(), HloOpcode::kCall);
InstructionValueSet& value_set = GetInstructionValueSet(call);
@@ -474,10 +429,6 @@ bool HloDataflowAnalysis::UpdateInstructionValueSet(
return UpdateCallValueSet(instruction);
case HloOpcode::kWhile:
return UpdateWhileValueSet(instruction);
- case HloOpcode::kSend:
- return UpdateSendValueSet(instruction);
- case HloOpcode::kRecvDone:
- return UpdateRecvDoneValueSet(instruction);
default:
// Instruction does not forward HloValues (it defines all values in its
// output). No update is necessary.
@@ -586,12 +537,6 @@ Status HloDataflowAnalysis::InitializeInstructionValueSets() {
GetValueSet(instruction, /*index=*/{}).AddValue(value);
};
- // Lambda to set the value set at the given index of the output.
- auto define_value_at = [this, &instruction](const ShapeIndex& index) {
- HloValue* value = NewHloValue(instruction, index, /*is_phi=*/false);
- GetValueSet(instruction, index).AddValue(value);
- };
-
switch (instruction->opcode()) {
case HloOpcode::kBitcast:
if (bitcast_defines_value_) {
@@ -632,16 +577,6 @@ Status HloDataflowAnalysis::InitializeInstructionValueSets() {
// values flow from their operands.
define_top_level_only();
break;
- case HloOpcode::kRecvDone:
- // RecvDone aliases its input tuple element {0} and therefore does not
- // define any values.
- break;
- case HloOpcode::kSend:
- // Send produces a tuple of {aliased operand, U32 context} and therefore
- // only defines the top-level tuple and the tuple element at {1}.
- define_value_at(/*index=*/{});
- define_value_at(/*index=*/{1});
- break;
default:
define_all_values();
break;
diff --git a/tensorflow/compiler/xla/service/hlo_dataflow_analysis.h b/tensorflow/compiler/xla/service/hlo_dataflow_analysis.h
index 63467f3206..207e553bf7 100644
--- a/tensorflow/compiler/xla/service/hlo_dataflow_analysis.h
+++ b/tensorflow/compiler/xla/service/hlo_dataflow_analysis.h
@@ -146,9 +146,7 @@ class HloDataflowAnalysis {
bool UpdateCopyValueSet(HloInstruction* copy);
bool UpdateGetTupleElementValueSet(HloInstruction* gte);
bool UpdateParameterValueSet(HloInstruction* parameter);
- bool UpdateRecvDoneValueSet(HloInstruction* recv_done);
bool UpdateSelectValueSet(HloInstruction* select);
- bool UpdateSendValueSet(HloInstruction* send);
bool UpdateTupleValueSet(HloInstruction* tuple);
bool UpdateWhileValueSet(HloInstruction* xla_while);
diff --git a/tensorflow/compiler/xla/service/hlo_dataflow_analysis_test.cc b/tensorflow/compiler/xla/service/hlo_dataflow_analysis_test.cc
index 66a538fc51..4b8eb237a6 100644
--- a/tensorflow/compiler/xla/service/hlo_dataflow_analysis_test.cc
+++ b/tensorflow/compiler/xla/service/hlo_dataflow_analysis_test.cc
@@ -1139,54 +1139,6 @@ TEST_P(HloDataflowAnalysisTest, TupleCopy) {
analysis.GetValueDefinedAt(copy, /*index=*/{}).live_out_of_module());
}
-TEST_P(HloDataflowAnalysisTest, SendAndSendDone) {
- // Test that a Send forwards its operand to the output tuple at {0}.
- auto builder = HloComputation::Builder(TestName());
- auto param = builder.AddInstruction(
- HloInstruction::CreateParameter(0, scalar_shape_, "param0"));
- auto send = builder.AddInstruction(
- HloInstruction::CreateSend(param, /*channel_id=*/0));
- auto send_done = builder.AddInstruction(HloInstruction::CreateSendDone(send));
- module_->AddEntryComputation(builder.Build());
-
- bool ssa_form = GetParam();
- const HloDataflowAnalysis& analysis = RunAnalysis(ssa_form);
-
- EXPECT_EQ(analysis.values().size(), 4);
-
- EXPECT_TRUE(analysis.ValueIsDefinedAt(param));
- EXPECT_TRUE(analysis.ValueIsDefinedAt(send, /*index=*/{}));
- EXPECT_FALSE(analysis.ValueIsDefinedAt(send, /*index=*/{0}));
- EXPECT_TRUE(analysis.ValueIsDefinedAt(send, /*index=*/{1}));
- EXPECT_TRUE(analysis.ValueIsDefinedAt(send_done));
- EXPECT_THAT(HloValuesAt(send, /*index=*/{0}),
- UnorderedElementsAre(analysis.GetValueDefinedAt(param)));
-}
-
-TEST_P(HloDataflowAnalysisTest, RecvAndRecvDone) {
- // Test that a RecvDone forwards its operand tuple element at {0} to the
- // output.
- auto builder = HloComputation::Builder(TestName());
- auto recv = builder.AddInstruction(
- HloInstruction::CreateRecv(scalar_shape_, /*channel_id=*/0));
- auto recv_done = builder.AddInstruction(HloInstruction::CreateRecvDone(recv));
- module_->AddEntryComputation(builder.Build());
-
- bool ssa_form = GetParam();
- const HloDataflowAnalysis& analysis = RunAnalysis(ssa_form);
-
- EXPECT_EQ(analysis.values().size(), 3);
-
- EXPECT_TRUE(analysis.ValueIsDefinedAt(recv, /*index=*/{}));
- EXPECT_TRUE(analysis.ValueIsDefinedAt(recv, /*index=*/{0}));
- EXPECT_TRUE(analysis.ValueIsDefinedAt(recv, /*index=*/{1}));
- EXPECT_FALSE(analysis.ValueIsDefinedAt(recv_done));
- EXPECT_THAT(HloValuesAt(recv_done),
- UnorderedElementsAre(analysis.GetValueDefinedAt(recv, {0})));
- EXPECT_TRUE(
- analysis.GetValueDefinedAt(recv, /*index=*/{0}).live_out_of_module());
-}
-
TEST_P(HloDataflowAnalysisTest, ElementwiseChainInterference) {
// A simple chain of elementwise operations. No values should interfere.
//
diff --git a/tensorflow/compiler/xla/service/hlo_evaluator.cc b/tensorflow/compiler/xla/service/hlo_evaluator.cc
index a722d1b3d9..88b77ccdd0 100644
--- a/tensorflow/compiler/xla/service/hlo_evaluator.cc
+++ b/tensorflow/compiler/xla/service/hlo_evaluator.cc
@@ -1450,10 +1450,6 @@ HloEvaluator::HloEvaluator() {
typed_visitors_[F32] = MakeUnique<TypedVisitor<float>>(this);
typed_visitors_[F64] = MakeUnique<TypedVisitor<double>>(this);
typed_visitors_[C64] = MakeUnique<TypedVisitor<complex64>>(this);
-
- typed_visitors_[BF16] = MakeUnique<FunctionVisitor>([](HloInstruction*) {
- return Unimplemented("HloEvaluator: unhandled primitive type: BF16.");
- });
typed_visitors_[TUPLE] = MakeUnique<FunctionVisitor>([](HloInstruction*) {
return Unimplemented("HloEvaluator: unhandled primitive type: TUPLE.");
});
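With the BF16 entry gone, BF16 no longer gets even the explicit "unimplemented" visitor; only the remaining entries (F32, F64, C64, TUPLE, ...) are populated. A sketch of the dispatch-table pattern this constructor uses, with invented minimal types rather than the real TypedVisitor/FunctionVisitor classes:

    #include <functional>
    #include <map>
    #include <stdexcept>

    enum PrimitiveType { F32, F64, C64, BF16, TUPLE };

    // Per-type visitor table: types without a real evaluator get a handler
    // that fails loudly instead of silently producing garbage.
    std::map<PrimitiveType, std::function<void()>> MakeVisitorTable() {
      std::map<PrimitiveType, std::function<void()>> table;
      table[F32] = [] { /* evaluate using float semantics */ };
      table[F64] = [] { /* evaluate using double semantics */ };
      table[TUPLE] = [] {
        throw std::runtime_error("unhandled primitive type: TUPLE");
      };
      return table;
    }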
diff --git a/tensorflow/compiler/xla/service/hlo_evaluator.h b/tensorflow/compiler/xla/service/hlo_evaluator.h
index 7557aaa248..67b6e215fc 100644
--- a/tensorflow/compiler/xla/service/hlo_evaluator.h
+++ b/tensorflow/compiler/xla/service/hlo_evaluator.h
@@ -39,18 +39,16 @@ class HloEvaluator : public DfsHloVisitorWithDefault {
HloEvaluator();
// Evaluates an HLO module and an array of pointers to literals.
// Returns the evaluated result as a literal if successful.
- // Precondition: The indices of arg_literals correspond to the parameter
- // numbers of the HLO parameters in the computation. See comment below for an
- // example.
+ // Precondition: argument literals correspond to each input computation's
+ // parameters in their post-ordering. See comment below for an example.
StatusOr<std::unique_ptr<Literal>> Evaluate(
const HloModule& module,
tensorflow::gtl::ArraySlice<const Literal*> arg_literals);
// Evaluates an HLO computation and an array of pointers to literals.
// Returns the evaluated result as a literal if successful.
- // Precondition: The indices of arg_literals correspond to the parameter
- // numbers of the HLO parameters in the computation. For e.g., consider the
- // following graph:
+ // Precondition: argument literals correspond to the input computation's
+ // parameters in their post-ordering. For example, consider the following graph:
//
// *
// / \
@@ -59,9 +57,8 @@ class HloEvaluator : public DfsHloVisitorWithDefault {
// / \
// Parameter0 Constant
//
- // where Parameter0 has parameter_number 0 and Parameter1 has parameter_number
- // 1 in this computation. The input literals array will then have its first
- // literal map to Parameter0 and the second map to Parameter1.
+ // The input literals array will have its first literal map to Parameter0 and
+ // the second map to Parameter1.
StatusOr<std::unique_ptr<Literal>> Evaluate(
const HloComputation& computation,
tensorflow::gtl::ArraySlice<const Literal*> arg_literals);
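Under either wording of the precondition, the binding is positional: the first literal feeds Parameter0 and the second feeds Parameter1 in the graph above. A toy sketch of that contract (FakeLiteral and the function are invented for illustration, not XLA API):

    #include <cassert>
    #include <vector>

    struct FakeLiteral { double value; };  // stand-in for xla::Literal

    // args[i] is bound to the parameter with parameter_number i, mirroring
    // the documented precondition for Evaluate().
    double EvaluateAddOfTwoParams(const std::vector<const FakeLiteral*>& args) {
      assert(args.size() == 2);
      return args[0]->value + args[1]->value;  // Parameter0 + Parameter1
    }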
diff --git a/tensorflow/compiler/xla/service/hlo_graph_dumper.cc b/tensorflow/compiler/xla/service/hlo_graph_dumper.cc
index 04b3059fb1..fd162622ce 100644
--- a/tensorflow/compiler/xla/service/hlo_graph_dumper.cc
+++ b/tensorflow/compiler/xla/service/hlo_graph_dumper.cc
@@ -761,22 +761,12 @@ string HloDotDumper::DumpInstruction(const HloInstruction* instr) {
string HloDotDumper::GetInstructionNodeInlinedOperands(
const HloInstruction* instr) {
auto stringify_constant = [](const HloInstruction* constant) {
- const auto& shape = constant->shape();
-
- // Print the literal value of constants with <= K elements.
- optional<int64> elem_count;
- if (!ShapeUtil::IsOpaque(shape) && !ShapeUtil::IsTuple(shape)) {
- elem_count = 1;
- for (int64 dim : shape.dimensions()) {
- *elem_count *= dim;
- }
- }
- if (elem_count.has_value() && *elem_count <= 8) {
- return Printf("%s (%s)", constant->literal().ToString(),
+ if (ShapeUtil::IsEffectiveScalar(constant->shape())) {
+ auto elem_idx = IndexUtil::LinearIndexToMultidimensionalIndex(
+ constant->shape(), /*linear_index=*/0);
+ return Printf("%s (%s)", constant->literal().GetAsString(elem_idx),
ShapeUtil::HumanString(constant->shape()));
}
-
- // Otherwise, print e.g. "%constant.42 (s32[100])".
string constant_name;
if (tensorflow::StringPiece(constant->name()).starts_with("%constant")) {
constant_name = constant->name();
@@ -943,9 +933,7 @@ ColorScheme HloDotDumper::GetInstructionColor(const HloInstruction* instr) {
case HloOpcode::kFusion:
return kGray;
case HloOpcode::kSend:
- case HloOpcode::kSendDone:
case HloOpcode::kRecv:
- case HloOpcode::kRecvDone:
case HloOpcode::kInfeed:
case HloOpcode::kOutfeed:
case HloOpcode::kCrossReplicaSum:
@@ -1039,9 +1027,7 @@ string HloDotDumper::GetInstructionNodeExtraInfo(const HloInstruction* instr) {
? ""
: StrCat("stride=", VectorString(instr->slice_strides()));
case HloOpcode::kSend:
- case HloOpcode::kSendDone:
case HloOpcode::kRecv:
- case HloOpcode::kRecvDone:
return StrCat("channel_id=", instr->channel_id());
default:
return "";
@@ -1303,9 +1289,7 @@ NodeFilter MakeNodeFilter(const HloInstruction* root, int64 radius) {
auto is_displayed = [&](const HloInstruction* instr) {
// Constants are displayed inline with their users; they're never omitted.
- // Nodes in subcomputations are always shown.
- return nodes.count(instr) > 0 || instr->opcode() == HloOpcode::kConstant ||
- instr->parent() != root->parent();
+ return nodes.count(instr) > 0 || instr->opcode() == HloOpcode::kConstant;
};
// Make a second pass over 'nodes' to fix up the NodeFilterResults now that we
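The branch removed from stringify_constant inlined any constant with at most 8 total elements; the restored code inlines only effective scalars. A self-contained sketch of the removed heuristic (the threshold of 8 is from the deleted code; the helper names are invented):

    #include <cstdint>
    #include <optional>
    #include <vector>

    // Element count of an array shape: the product of its dimensions.
    // Tuples and opaque shapes have no meaningful count, hence nullopt.
    std::optional<int64_t> ElementCount(const std::vector<int64_t>& dims,
                                        bool is_array) {
      if (!is_array) return std::nullopt;
      int64_t count = 1;
      for (int64_t dim : dims) count *= dim;
      return count;
    }

    bool ShouldInlineConstant(const std::vector<int64_t>& dims, bool is_array,
                              int64_t threshold = 8) {
      auto count = ElementCount(dims, is_array);
      return count.has_value() && *count <= threshold;
    }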
diff --git a/tensorflow/compiler/xla/service/hlo_instruction.cc b/tensorflow/compiler/xla/service/hlo_instruction.cc
index 674d3e3836..5107ac782d 100644
--- a/tensorflow/compiler/xla/service/hlo_instruction.cc
+++ b/tensorflow/compiler/xla/service/hlo_instruction.cc
@@ -371,50 +371,20 @@ HloInstruction::CreateCrossReplicaSum(const Shape& shape,
/* static */ std::unique_ptr<HloInstruction> HloInstruction::CreateSend(
HloInstruction* operand, int64 channel_id) {
- // Send instruction produces a tuple of {aliased operand, U32 context}.
- Shape output_shape = ShapeUtil::MakeTupleShape(
- {operand->shape(), ShapeUtil::MakeShape(U32, {})});
auto instruction =
- WrapUnique(new HloInstruction(HloOpcode::kSend, output_shape));
+ WrapUnique(new HloInstruction(HloOpcode::kSend, ShapeUtil::MakeNil()));
instruction->AppendOperand(operand);
instruction->channel_id_ = channel_id;
return instruction;
}
-/* static */ std::unique_ptr<HloInstruction> HloInstruction::CreateSendDone(
- HloInstruction* operand) {
- CHECK(operand->opcode() == HloOpcode::kSend)
- << "SendDone must take the context operand from Send";
- auto instruction = WrapUnique(
- new HloInstruction(HloOpcode::kSendDone, ShapeUtil::MakeNil()));
- instruction->AppendOperand(operand);
- instruction->channel_id_ = operand->channel_id();
- return instruction;
-}
-
/* static */ std::unique_ptr<HloInstruction> HloInstruction::CreateRecv(
const Shape& shape, int64 channel_id) {
- // Recv instruction produces a tuple of {receive buffer, U32 context}.
- Shape output_shape =
- ShapeUtil::MakeTupleShape({shape, ShapeUtil::MakeShape(U32, {})});
- auto instruction =
- WrapUnique(new HloInstruction(HloOpcode::kRecv, output_shape));
+ auto instruction = WrapUnique(new HloInstruction(HloOpcode::kRecv, shape));
instruction->channel_id_ = channel_id;
return instruction;
}
-/* static */ std::unique_ptr<HloInstruction> HloInstruction::CreateRecvDone(
- HloInstruction* operand) {
- CHECK(operand->opcode() == HloOpcode::kRecv)
- << "RecvDone must take the context operand from Recv";
- Shape output_shape = ShapeUtil::GetTupleElementShape(operand->shape(), 0);
- auto instruction =
- WrapUnique(new HloInstruction(HloOpcode::kRecvDone, output_shape));
- instruction->AppendOperand(operand);
- instruction->channel_id_ = operand->channel_id();
- return instruction;
-}
-
/* static */ std::unique_ptr<HloInstruction> HloInstruction::CreateReverse(
const Shape& shape, HloInstruction* operand,
tensorflow::gtl::ArraySlice<int64> dimensions) {
@@ -938,9 +908,7 @@ RandomDistribution HloInstruction::random_distribution() const {
bool HloInstruction::HasSideEffect() const {
switch (opcode_) {
case HloOpcode::kSend:
- case HloOpcode::kSendDone:
case HloOpcode::kRecv:
- case HloOpcode::kRecvDone:
case HloOpcode::kInfeed:
case HloOpcode::kOutfeed:
case HloOpcode::kTrace:
@@ -1196,9 +1164,7 @@ std::unique_ptr<HloInstruction> HloInstruction::CloneWithNewOperands(
new_operands[4], epsilon(), feature_index());
break;
case HloOpcode::kRecv:
- case HloOpcode::kRecvDone:
case HloOpcode::kSend:
- case HloOpcode::kSendDone:
case HloOpcode::kTrace:
LOG(FATAL) << "Not yet implemented, clone: " << HloOpcodeString(opcode_);
}
@@ -1591,10 +1557,8 @@ bool HloInstruction::IdenticalSlowPath(
case HloOpcode::kInfeed:
case HloOpcode::kOutfeed:
case HloOpcode::kSort:
- case HloOpcode::kRecv:
- case HloOpcode::kRecvDone:
case HloOpcode::kSend:
- case HloOpcode::kSendDone:
+ case HloOpcode::kRecv:
return false;
}
}
@@ -1886,13 +1850,12 @@ std::vector<string> HloInstruction::ExtraAttributesToString() const {
extra.push_back(StrCat("dimensions={", Join(dimensions(), ","), "}"));
}
if (window_ != nullptr) {
- extra.push_back(StrCat("window={", window_util::ToString(*window_), "}"));
+ extra.push_back(window_util::ToString(*window_));
}
if (padding_config_ != nullptr) {
- extra.push_back(
- StrCat("padding=", xla::PaddingConfigToString(*padding_config_)));
+ extra.push_back(StrCat("padding=", padding_config_->ShortDebugString()));
}
- if (opcode() == HloOpcode::kSlice) {
+ if (!slice_starts_.empty() && !slice_limits_.empty()) {
std::vector<string> bounds;
bounds.reserve(slice_starts_.size());
const bool omit_stride =
@@ -1905,16 +1868,6 @@ std::vector<string> HloInstruction::ExtraAttributesToString() const {
}
extra.push_back(StrCat("slice={", Join(bounds, ", "), "}"));
}
- if (opcode() == HloOpcode::kDynamicSlice) {
- extra.push_back(
- StrCat("dynamic_slice_sizes={", Join(dynamic_slice_sizes(), ","), "}"));
- }
- if (opcode() == HloOpcode::kBatchNormTraining ||
- opcode() == HloOpcode::kBatchNormInference ||
- opcode() == HloOpcode::kBatchNormGrad) {
- extra.push_back(StrCat("epsilon=", epsilon()));
- extra.push_back(StrCat("feature_index=", feature_index()));
- }
if (convolution_dimension_numbers_ != nullptr) {
extra.push_back(ConvolutionDimensionNumbersToString());
@@ -1938,8 +1891,7 @@ std::vector<string> HloInstruction::ExtraAttributesToString() const {
})));
}
- if (opcode() == HloOpcode::kSend || opcode() == HloOpcode::kRecv ||
- opcode() == HloOpcode::kSendDone || opcode() == HloOpcode::kRecvDone) {
+ if (opcode() == HloOpcode::kSend || opcode() == HloOpcode::kRecv) {
extra.push_back(StrCat("channel_id=", channel_id_));
}
@@ -2119,10 +2071,8 @@ bool HloInstruction::IsFusable() const {
case HloOpcode::kOutfeed:
case HloOpcode::kParameter:
case HloOpcode::kTrace:
- case HloOpcode::kRecv:
- case HloOpcode::kRecvDone:
case HloOpcode::kSend:
- case HloOpcode::kSendDone:
+ case HloOpcode::kRecv:
return false;
// Only fuse Rng if it is used once, otherwise the random numbers generated
// will be different in each fusion. If it is the root (user count = 0)
@@ -2329,14 +2279,10 @@ Status HloInstruction::Visit(DfsHloVisitorBase<HloInstructionPtr>* visitor) {
return visitor->HandleCall(this);
case HloOpcode::kCustomCall:
return visitor->HandleCustomCall(this);
- case HloOpcode::kRecv:
- return visitor->HandleRecv(this);
- case HloOpcode::kRecvDone:
- return visitor->HandleRecvDone(this);
case HloOpcode::kSend:
return visitor->HandleSend(this);
- case HloOpcode::kSendDone:
- return visitor->HandleSendDone(this);
+ case HloOpcode::kRecv:
+ return visitor->HandleRecv(this);
// These opcodes are not handled here.
case HloOpcode::kTrace:
@@ -2895,21 +2841,6 @@ StatusOr<HloInstruction::FusionKind> StringToFusionKind(
return InvalidArgument("Unknown fusion kind: %s", kind_name.c_str());
}
-string PaddingConfigToString(const PaddingConfig& padding) {
- bool has_interior_padding =
- std::any_of(padding.dimensions().begin(), padding.dimensions().end(),
- [](const PaddingConfig::PaddingConfigDimension& dim) {
- return dim.interior_padding() != 0;
- });
- return Join(
- padding.dimensions(), "x",
- [&](string* out, const PaddingConfig::PaddingConfigDimension& dim) {
- StrAppend(
- out, dim.edge_padding_low(), "_", dim.edge_padding_high(),
- has_interior_padding ? StrCat("_", dim.interior_padding()) : "");
- });
-}
-
std::ostream& operator<<(std::ostream& os, HloInstruction::FusionKind kind) {
return os << ToString(kind);
}
@@ -2925,7 +2856,13 @@ string HloInstruction::ConvolutionDimensionNumbersToString() const {
const auto append_dims = [&](const std::vector<string>& dims,
const Shape& shape) {
CHECK_EQ(dims.size(), ShapeUtil::Rank(shape));
- StrAppend(&result, Join(dims, ""));
+ for (int64 logical = 0; logical < dims.size(); ++logical) {
+ int64 physical = logical;
+ if (!shape.layout().minor_to_major().empty()) {
+ physical = LayoutUtil::Major(shape.layout(), logical);
+ }
+ result += dims[physical];
+ }
};
// lhs_dims[i] is the symbol of the logical dimension i for the lhs
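The restored append_dims emits dimension symbols in physical (major-to-minor) order: output position i holds the symbol of whichever logical dimension is the i-th most major. Assuming the usual XLA convention that minor_to_major[0] is the most minor logical dimension (so LayoutUtil::Major(layout, i) is minor_to_major[rank - 1 - i]), a standalone sketch:

    #include <cstdint>
    #include <string>
    #include <vector>

    // dims holds one symbol per logical dimension, e.g. {"b", "f", "0", "1"}.
    std::string DimsInPhysicalOrder(const std::vector<std::string>& dims,
                                    const std::vector<int64_t>& minor_to_major) {
      std::string result;
      const int64_t rank = static_cast<int64_t>(dims.size());
      for (int64_t logical = 0; logical < rank; ++logical) {
        int64_t physical = logical;  // identity when no layout is present
        if (!minor_to_major.empty()) {
          physical = minor_to_major[rank - 1 - logical];  // "Major(logical)"
        }
        result += dims[physical];
      }
      return result;
    }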
diff --git a/tensorflow/compiler/xla/service/hlo_instruction.h b/tensorflow/compiler/xla/service/hlo_instruction.h
index d174f05aa6..5ff04a4888 100644
--- a/tensorflow/compiler/xla/service/hlo_instruction.h
+++ b/tensorflow/compiler/xla/service/hlo_instruction.h
@@ -181,28 +181,18 @@ class HloInstruction {
const Shape& shape, HloInstruction* operand,
tensorflow::StringPiece outfeed_config);
- // Creates an asynchronous send instruction with the given channel id, which
- // initiates sending the operand data to a unique receive instruction in
- // another computation that has the same channel id.
+ // Creates a send instruction with the given channel id, which sends the
+ // operand data to a unique receive instruction in another computation that
+ // has the same channel id.
static std::unique_ptr<HloInstruction> CreateSend(HloInstruction* operand,
int64 channel_id);
- // Blocks until data transfer for the Send instruction (operand) is complete.
- // The operand must be kSend.
- static std::unique_ptr<HloInstruction> CreateSendDone(
- HloInstruction* operand);
-
- // Creates an asynchronous receive instruction with the given channel id,
- // which allocates resources to receive data of the given shape from a unique
- // send instruction in another computation that has the same channel id.
+ // Creates a receive instruction with the given channel id, which receives
+ // data of the given shape from a unique send instruction in another
+ // computation that has the same channel id.
static std::unique_ptr<HloInstruction> CreateRecv(const Shape& shape,
int64 channel_id);
- // Blocks until data transfer for the Recv instruction (operand) is complete
- // and returns the receive buffer. The operand must be kRecv.
- static std::unique_ptr<HloInstruction> CreateRecvDone(
- HloInstruction* operand);
-
// Creates a slice instruction, where the operand is sliced by the given
// start/limit indices.
static std::unique_ptr<HloInstruction> CreateSlice(
@@ -212,7 +202,7 @@ class HloInstruction {
tensorflow::gtl::ArraySlice<int64> strides);
// Creates a slice instruction, where the first operand is sliced by
- // start indices specified in the second operand, and by size specfied in
+ // start indices specified in the second operand, and by size specified in
// 'slice_sizes'.
static std::unique_ptr<HloInstruction> CreateDynamicSlice(
const Shape& shape, HloInstruction* operand,
@@ -863,11 +853,6 @@ class HloInstruction {
return *window_;
}
- // Sets the window data in a windowed operation such as convolution.
- void set_window(const Window& window) {
- window_ = MakeUnique<Window>(window);
- }
-
// Returns the padding configuration for a pad node.
//
// Precondition: opcode() == HloOpcode::kPad
@@ -1239,8 +1224,6 @@ string ToString(HloInstruction::FusionKind kind);
StatusOr<HloInstruction::FusionKind> StringToFusionKind(
const string& kind_name);
-string PaddingConfigToString(const PaddingConfig& padding);
-
std::ostream& operator<<(std::ostream& os, HloInstruction::FusionKind kind);
// Map classes that guarantee a deterministic iteration order when the key is
diff --git a/tensorflow/compiler/xla/service/hlo_instruction_test.cc b/tensorflow/compiler/xla/service/hlo_instruction_test.cc
index 4ead64d997..ddb623332c 100644
--- a/tensorflow/compiler/xla/service/hlo_instruction_test.cc
+++ b/tensorflow/compiler/xla/service/hlo_instruction_test.cc
@@ -792,8 +792,8 @@ TEST_F(HloInstructionTest, ComplexFusionOp) {
// sub = Sub(mul, clamp)
// tuple = Tuple({sub, sub, mul, C1})
//
- // Notable complexities are repeated operands in a same instruction, different
- // shapes, use of value in different expressions.
+ // Notable complexities are repeated operands in the same instruction,
+ // different shapes, use of value in different expressions.
auto c1 = builder.AddInstruction(
HloInstruction::CreateConstant(Literal::CreateR0<float>(1.1f)));
auto c2 = builder.AddInstruction(
diff --git a/tensorflow/compiler/xla/service/hlo_matchers.h b/tensorflow/compiler/xla/service/hlo_matchers.h
index 268fa0f632..4d4010b025 100644
--- a/tensorflow/compiler/xla/service/hlo_matchers.h
+++ b/tensorflow/compiler/xla/service/hlo_matchers.h
@@ -121,7 +121,6 @@ HLO_MATCHER(Outfeed);
HLO_MATCHER(Pad);
HLO_MATCHER(Power);
HLO_MATCHER(Recv);
-HLO_MATCHER(RecvDone);
HLO_MATCHER(Reduce);
HLO_MATCHER(ReducePrecision);
HLO_MATCHER(ReduceWindow);
@@ -132,7 +131,6 @@ HLO_MATCHER(Rng);
HLO_MATCHER(Select);
HLO_MATCHER(SelectAndScatter);
HLO_MATCHER(Send);
-HLO_MATCHER(SendDone);
HLO_MATCHER(ShiftLeft);
HLO_MATCHER(ShiftRightLogical);
HLO_MATCHER(ShiftRightArithmetic);
diff --git a/tensorflow/compiler/xla/service/hlo_module.h b/tensorflow/compiler/xla/service/hlo_module.h
index 5141e7bc8d..6469851791 100644
--- a/tensorflow/compiler/xla/service/hlo_module.h
+++ b/tensorflow/compiler/xla/service/hlo_module.h
@@ -85,11 +85,7 @@ class HloModule {
std::unique_ptr<HloModule> Clone(const string& suffix = "clone") const;
// Returns a pointer to the entry computation of the module.
- const HloComputation* entry_computation() const {
- CHECK_NE(nullptr, entry_computation_);
- return entry_computation_;
- }
- HloComputation* entry_computation() {
+ HloComputation* entry_computation() const {
CHECK_NE(nullptr, entry_computation_);
return entry_computation_;
}
diff --git a/tensorflow/compiler/xla/service/hlo_module_config.cc b/tensorflow/compiler/xla/service/hlo_module_config.cc
index 822e2f1f53..8974deb530 100644
--- a/tensorflow/compiler/xla/service/hlo_module_config.cc
+++ b/tensorflow/compiler/xla/service/hlo_module_config.cc
@@ -39,8 +39,8 @@ void HloModuleConfig::SetDefaultComputationLayout(
}
string HloModuleConfig::compilation_cache_key() const {
- string key =
- tensorflow::strings::StrCat("profiling=", hlo_profiling_enabled_);
+ string key = tensorflow::strings::StrCat("profiling=", hlo_profiling_enabled_,
+ "::hybrid=", has_hybrid_result_);
StrAppend(&key, "::(");
std::vector<string> params;
for (const ShapeLayout& param_layout :
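Every option that changes generated code has to be folded into compilation_cache_key, otherwise two configs differing only in has_hybrid_result_ would collide in the compilation cache. A minimal sketch of the keying idea with plain std::string in place of tensorflow::strings::StrCat:

    #include <string>

    // Any flag that affects codegen must appear in the key; the "::"
    // separators keep adjacent fields from running together ambiguously.
    std::string CompilationCacheKey(bool hlo_profiling_enabled,
                                    bool has_hybrid_result) {
      std::string key = "profiling=" + std::to_string(hlo_profiling_enabled);
      key += "::hybrid=" + std::to_string(has_hybrid_result);
      return key;
    }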
diff --git a/tensorflow/compiler/xla/service/hlo_module_config.h b/tensorflow/compiler/xla/service/hlo_module_config.h
index a5ee895e48..4a7ead9c10 100644
--- a/tensorflow/compiler/xla/service/hlo_module_config.h
+++ b/tensorflow/compiler/xla/service/hlo_module_config.h
@@ -104,6 +104,16 @@ class HloModuleConfig {
// Whether to enable HLO-level profiling.
bool hlo_profiling_enabled_ = false;
+ // If this flag is true, the generated executable will return a ShapedBuffer
+ // holding the result of the computation. In a ShapedBuffer, tuples have their
+ // structure held in host memory and the element arrays (leaves of the tuple
+ // structure) stored in device memory. The ShapedBuffer is considered "hybrid"
+ // because its leaves are on device but its structure is stored on
+ // host. Otherwise, if this flag is false, the generated executable will
+ // return a DeviceMemoryBase where the result is held entirely in device
+ // memory.
+ bool has_hybrid_result_ = false;
+
// Module/graph-level seed handle.
uint64 seed_ = 0;
diff --git a/tensorflow/compiler/xla/service/hlo_opcode.h b/tensorflow/compiler/xla/service/hlo_opcode.h
index e0d02e0665..d68fc20321 100644
--- a/tensorflow/compiler/xla/service/hlo_opcode.h
+++ b/tensorflow/compiler/xla/service/hlo_opcode.h
@@ -97,7 +97,6 @@ namespace xla {
V(kPower, "power") \
V(kReal, "real") \
V(kRecv, "recv") \
- V(kRecvDone, "recv-done") \
V(kReduce, "reduce") \
V(kReducePrecision, "reduce-precision") \
V(kReduceWindow, "reduce-window") \
@@ -109,7 +108,6 @@ namespace xla {
V(kSelect, "select") \
V(kSelectAndScatter, "select-and-scatter") \
V(kSend, "send") \
- V(kSendDone, "send-done") \
V(kShiftLeft, "shift-left") \
V(kShiftRightArithmetic, "shift-right-arithmetic") \
V(kShiftRightLogical, "shift-right-logical") \
diff --git a/tensorflow/compiler/xla/service/hlo_rematerialization.cc b/tensorflow/compiler/xla/service/hlo_rematerialization.cc
index 828be8490c..c96df50e79 100644
--- a/tensorflow/compiler/xla/service/hlo_rematerialization.cc
+++ b/tensorflow/compiler/xla/service/hlo_rematerialization.cc
@@ -66,9 +66,7 @@ bool IsRematerializable(const HloInstruction* instruction) {
case HloOpcode::kInfeed:
case HloOpcode::kParameter:
case HloOpcode::kRecv:
- case HloOpcode::kRecvDone:
case HloOpcode::kSend:
- case HloOpcode::kSendDone:
case HloOpcode::kTrace:
case HloOpcode::kWhile:
return false;
diff --git a/tensorflow/compiler/xla/service/hlo_runner.cc b/tensorflow/compiler/xla/service/hlo_runner.cc
index 158fb9a546..f463e57d99 100644
--- a/tensorflow/compiler/xla/service/hlo_runner.cc
+++ b/tensorflow/compiler/xla/service/hlo_runner.cc
@@ -12,7 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
-#define EIGEN_USE_THREADS
#include "tensorflow/compiler/xla/service/hlo_runner.h"
@@ -20,6 +19,8 @@ limitations under the License.
#include <string>
#include <utility>
+#define EIGEN_USE_THREADS
+
#include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
#include "tensorflow/compiler/xla/layout_util.h"
#include "tensorflow/compiler/xla/ptr_util.h"
diff --git a/tensorflow/compiler/xla/service/hlo_sharding.cc b/tensorflow/compiler/xla/service/hlo_sharding.cc
index 7356663454..0d019d22f5 100644
--- a/tensorflow/compiler/xla/service/hlo_sharding.cc
+++ b/tensorflow/compiler/xla/service/hlo_sharding.cc
@@ -16,7 +16,6 @@ limitations under the License.
#include "tensorflow/compiler/xla/service/hlo_sharding.h"
#include "tensorflow/core/lib/core/errors.h"
-#include "tensorflow/core/lib/strings/str_util.h"
namespace xla {
@@ -39,15 +38,6 @@ HloSharding HloSharding::Tile1D(const Shape& input_shape, int64 num_tiles) {
}
string HloSharding::ToString() const {
- if (IsTuple()) {
- std::vector<string> parts;
- parts.reserve(tuple_elements_.size());
- for (const HloSharding& element : tuple_elements_) {
- parts.push_back(element.ToString());
- }
- return StrCat("{", tensorflow::str_util::Join(parts, ", "), "}");
- }
-
string result = StrCat("{", (replicated_ ? " replicated" : ""),
(maximal_ ? " maximal" : ""));
@@ -63,11 +53,6 @@ string HloSharding::ToString() const {
}
bool HloSharding::UsesDevice(int64 device) const {
- if (IsTuple()) {
- return std::any_of(
- tuple_elements_.begin(), tuple_elements_.end(),
- [&](const HloSharding& s) { return s.UsesDevice(device); });
- }
const auto& devices = tile_assignment_;
return replicated_ ||
std::find(devices.begin(), devices.end(), device) != devices.end();
@@ -76,7 +61,6 @@ bool HloSharding::UsesDevice(int64 device) const {
std::vector<int64> HloSharding::TileIndexForDevice(int64 device) const {
CHECK(!ShapeUtil::IsTuple(tile_shape_));
CHECK(!maximal_);
- CHECK(!IsTuple());
std::vector<int64> ret_index;
tile_assignment_.Each([&](tensorflow::gtl::ArraySlice<int64> index, int64 d) {
if (d == device) {
@@ -90,7 +74,6 @@ std::vector<int64> HloSharding::TileIndexForDevice(int64 device) const {
int64 HloSharding::DeviceForTileIndex(
tensorflow::gtl::ArraySlice<int64> index) const {
CHECK(!replicated_);
- CHECK(!IsTuple());
if (maximal_) {
return *tile_assignment_.begin();
}
@@ -99,7 +82,7 @@ int64 HloSharding::DeviceForTileIndex(
}
std::vector<int64> HloSharding::TileOffsetForDevice(int64 device) const {
- CHECK(!IsTuple());
+ CHECK(!ShapeUtil::IsTuple(tile_shape_));
std::vector<int64> index = TileIndexForDevice(device);
if (maximal_) {
@@ -114,7 +97,7 @@ std::vector<int64> HloSharding::TileOffsetForDevice(int64 device) const {
}
std::vector<int64> HloSharding::TileLimitForDevice(int64 device) const {
- CHECK(!IsTuple());
+ CHECK(!ShapeUtil::IsTuple(tile_shape_));
CHECK(!maximal_); // Maximal shardings do not have a valid tile shape.
std::vector<int64> index = TileIndexForDevice(device);
@@ -125,41 +108,13 @@ std::vector<int64> HloSharding::TileLimitForDevice(int64 device) const {
}
StatusOr<int64> HloSharding::UniqueDevice() const {
- if (IsTuple()) {
- if (tuple_elements_.empty()) {
- return tensorflow::errors::InvalidArgument(
- "UniqueDevice() called on empty tuple");
- }
- std::vector<StatusOr<int64>> results;
- std::transform(tuple_elements_.begin(), tuple_elements_.end(),
- std::back_inserter(results),
- [](const HloSharding& s) { return s.UniqueDevice(); });
- if (std::all_of(results.begin(), results.end(),
- [&](const StatusOr<int64>& s) {
- return s.ok() && results[0].ok() &&
- s.ValueOrDie() == results[0].ValueOrDie();
- })) {
- return results[0];
- } else {
- return tensorflow::errors::InvalidArgument(
- "Tuple did not contain a unique device");
- }
- }
- if (!replicated_ && maximal_ && !IsTuple()) {
+ if (!replicated_ && maximal_) {
return static_cast<int64>(*tile_assignment_.begin());
}
return tensorflow::errors::InvalidArgument(
"UniqueDevice() called on sharding that executes on multiple devices");
}
-bool HloSharding::HasUniqueDevice() const {
- if (IsTuple()) {
- return UniqueDevice().status().ok();
- } else {
- return !IsReplicated() && IsTileMaximal();
- }
-}
-
Status HloSharding::Validate(const Shape& shape, int64 num_devices) const {
if (replicated_) {
return Status::OK();
@@ -238,19 +193,9 @@ Status HloSharding::Validate(const Shape& shape, int64 num_devices) const {
/*static*/ StatusOr<HloSharding> HloSharding::FromProto(
const OpSharding& proto) {
- if (proto.type() == OpSharding::Type::OpSharding_Type_TUPLE) {
- std::vector<HloSharding> tuple_shardings;
- tuple_shardings.reserve(proto.tuple_shardings().size());
- for (const OpSharding& tuple_sharding_proto : proto.tuple_shardings()) {
- TF_ASSIGN_OR_RETURN(HloSharding sharding,
- HloSharding::FromProto(tuple_sharding_proto));
- tuple_shardings.push_back(sharding);
- }
- return HloSharding(tuple_shardings);
- } else if (proto.type() == OpSharding::Type::OpSharding_Type_REPLICATED) {
+ if (proto.type() == OpSharding::Type::OpSharding_Type_REPLICATED) {
return Replicate();
- } else if (proto.type() == OpSharding::Type::OpSharding_Type_MAXIMAL ||
- proto.tile_assignment_devices().size() == 1) {
+ } else if (proto.type() == OpSharding::Type::OpSharding_Type_MAXIMAL) {
return HloSharding(proto.tile_assignment_devices(0));
}
// Some versions of gcc cannot infer the TileAssignment constructor from a
@@ -267,15 +212,6 @@ Status HloSharding::Validate(const Shape& shape, int64 num_devices) const {
OpSharding HloSharding::ToProto() const {
OpSharding result;
-
- if (IsTuple()) {
- for (const HloSharding& element : tuple_elements_) {
- *result.add_tuple_shardings() = element.ToProto();
- }
- result.set_type(OpSharding::Type::OpSharding_Type_TUPLE);
- return result;
- }
-
*result.mutable_tile_shape() = tile_shape_;
for (int64 dim : tile_assignment_.dimensions()) {
result.add_tile_assignment_dimensions(dim);
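The tuple branch deleted from UniqueDevice() succeeds only when every leaf sharding resolves to the same device; an empty tuple or any disagreement yields InvalidArgument. A standalone sketch of that agreement check, with std::optional standing in for StatusOr<int64>:

    #include <cstdint>
    #include <optional>
    #include <vector>

    // Returns the device shared by all leaf shardings, or nullopt when the
    // tuple is empty, any leaf has no unique device, or the leaves disagree.
    std::optional<int64_t> UniqueDeviceForTuple(
        const std::vector<std::optional<int64_t>>& leaf_devices) {
      if (leaf_devices.empty()) return std::nullopt;
      const std::optional<int64_t>& first = leaf_devices.front();
      if (!first.has_value()) return std::nullopt;
      for (const auto& device : leaf_devices) {
        if (!device.has_value() || *device != *first) return std::nullopt;
      }
      return first;
    }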
diff --git a/tensorflow/compiler/xla/service/hlo_sharding.h b/tensorflow/compiler/xla/service/hlo_sharding.h
index f8ef2a3d05..d7ada30c70 100644
--- a/tensorflow/compiler/xla/service/hlo_sharding.h
+++ b/tensorflow/compiler/xla/service/hlo_sharding.h
@@ -24,7 +24,6 @@ limitations under the License.
#include "tensorflow/compiler/xla/array.h"
#include "tensorflow/compiler/xla/literal_util.h"
#include "tensorflow/compiler/xla/protobuf_util.h"
-#include "tensorflow/compiler/xla/shape_tree.h"
#include "tensorflow/compiler/xla/xla_data.pb.h"
#include "tensorflow/core/lib/gtl/array_slice.h"
#include "tensorflow/core/lib/hash/hash.h"
@@ -68,18 +67,6 @@ class HloSharding {
// `num_tiles` tiles.
static HloSharding Tile1D(const Shape& input_shape, int64 num_tiles);
- // Creates a new sharding for a tuple type. The given ShapeTree must have
- // elements for every leaf shape contained in the tuple.
- static HloSharding Tuple(const ShapeTree<HloSharding>& sub_shardings) {
- std::vector<HloSharding> flattened_list;
- flattened_list.reserve(
- std::distance(sub_shardings.leaf_begin(), sub_shardings.leaf_end()));
- for (const auto& index_to_sharding : sub_shardings.leaves()) {
- flattened_list.push_back(index_to_sharding.second);
- }
- return HloSharding(flattened_list);
- }
-
// Create a new sharding from a protobuf OpSharding.
static StatusOr<HloSharding> FromProto(const OpSharding& proto);
@@ -89,89 +76,47 @@ class HloSharding {
// Validate that this sharding can be applied to a tensor with shape `shape`.
Status Validate(const Shape& shape, int64 num_devices) const;
- // Returns true if the sharding has tuple type.
- bool IsTuple() const { return tuple_; }
-
// Returns true if the sharding is trivial: replicate on all devices.
- bool IsReplicated() const {
- if (!IsTuple()) {
- return replicated_;
- }
- return std::all_of(tuple_elements_.begin(), tuple_elements_.end(),
- [](const HloSharding& s) { return s.IsReplicated(); });
- }
+ bool IsReplicated() const { return replicated_; }
// Returns true if the tile size is the same as the input size.
- bool IsTileMaximal() const {
- if (!IsTuple()) {
- return maximal_;
- }
- return std::all_of(tuple_elements_.begin(), tuple_elements_.end(),
- [](const HloSharding& s) { return s.IsTileMaximal(); });
- }
+ bool IsTileMaximal() const { return maximal_; }
// Returns true if the sharding defines an operation on the given device.
bool UsesDevice(int64 device) const;
// Returns the tile that should be executed on the given device.
- // REQUIRES: !IsTuple()
std::vector<int64> TileIndexForDevice(int64 device) const;
// Returns the device that should execute the given tile.
// It is an error to call this if is_replicated() is true.
- // REQUIRES: !IsTuple()
int64 DeviceForTileIndex(tensorflow::gtl::ArraySlice<int64> index) const;
// Given a device ID, returns the offset within the input space of the
// tile that should be executed on the given core. This returns the lower
// extent of the tile in the input space.
- // REQUIRES: !IsTuple()
std::vector<int64> TileOffsetForDevice(int64 device) const;
// Given a device ID, returns the limit within the input space of the
// tile that should be executed on the given core. This returns the upper
// extent of the tile in the input space.
- // REQUIRES: !IsTuple()
std::vector<int64> TileLimitForDevice(int64 device) const;
// Returns the single device this op operates on.
- // REQUIRES: !IsTuple&& !Replicated() && IsTileMaximal()
+ // Requires !Replicated() && IsTileMaximal().
StatusOr<int64> UniqueDevice() const;
// Returns true if this op only uses a single device.
- bool HasUniqueDevice() const;
-
- // Returns the ShapeTree containing the shardings for each element of this
- // tuple. Only the leaf elements are populated. This creates a new ShapeTree
- // object so is not cheap. REQUIRES: IsTuple()
- ShapeTree<HloSharding> GetTupleShardingsAsShapeTree(
- const Shape& tuple_shape) const {
- ShapeTree<HloSharding> result(tuple_shape, HloSharding::Replicate());
- CHECK_EQ(std::distance(result.leaf_begin(), result.leaf_end()),
- tuple_elements_.size());
- auto it = tuple_elements_.begin();
- for (auto& index_to_sharding : result.leaves()) {
- index_to_sharding.second = *it++;
- }
- return result;
- }
+ bool HasUniqueDevice() const { return !IsReplicated() && IsTileMaximal(); }
bool operator==(const HloSharding& other) const {
return replicated_ == other.replicated_ && maximal_ == other.maximal_ &&
protobuf_util::ProtobufEquals(tile_shape_, other.tile_shape_) &&
- tile_assignment_ == other.tile_assignment_ &&
- tuple_elements_ == other.tuple_elements_;
+ tile_assignment_ == other.tile_assignment_;
}
bool operator!=(const HloSharding& other) const { return !(*this == other); }
size_t Hash() const {
- if (tuple_) {
- size_t h = 0;
- for (const auto& element : tuple_elements_) {
- h = tensorflow::Hash64Combine(h, element.Hash());
- }
- return h;
- }
if (replicated_) {
return 0;
}
@@ -186,47 +131,33 @@ class HloSharding {
}
// Gets the tile shape.
- // REQUIRES: !IsTileMaximal() && !IsTuple()
+ // It is an error to call this if IsTileMaximal() is true.
const Shape& tile_shape() const { return tile_shape_; }
// Gets the tile assignment tensor.
- // REQUIRES: !IsReplicated() && !IsTuple()
+ // It is an error to call this if IsReplicated() is true.
const Array<int64>& tile_assignment() const { return tile_assignment_; }
private:
HloSharding()
: replicated_(true),
maximal_(true),
- tuple_(false),
tile_shape_(),
tile_assignment_({0}) {}
explicit HloSharding(int64 device_id)
: replicated_(false),
maximal_(true),
- tuple_(false),
tile_shape_(),
tile_assignment_({1}, device_id) {}
HloSharding(const Shape& tile_shape, const Array<int64>& tile_assignment)
: replicated_(false),
maximal_(false),
- tuple_(false),
tile_shape_(tile_shape),
tile_assignment_(tile_assignment) {}
- HloSharding(const std::vector<HloSharding>& tuple_shardings)
- : replicated_(false),
- maximal_(false),
- tuple_(true),
- tile_assignment_({0}),
- tuple_elements_(tuple_shardings) {}
bool replicated_;
bool maximal_;
- bool tuple_;
Shape tile_shape_;
Array<int64> tile_assignment_;
- // Only non-empty when tuple_ is true, but because empty tuples are allowed
- // may also be empty even then. This is a flattened list of all the leaf
- // shardings in a tuple shape, by pre-order walk (ShapeTree iterator order).
- std::vector<HloSharding> tuple_elements_;
};
} // namespace xla
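The removed tuple branch of Hash() (guarded by tuple_ being true) folds the element hashes together in order, so two shardings that compare equal under operator== also hash equally. A minimal sketch, with a hypothetical mixer standing in for tensorflow::Hash64Combine:

    #include <cstdint>
    #include <vector>

    // Hypothetical stand-in for tensorflow::Hash64Combine.
    inline uint64_t Hash64Combine(uint64_t a, uint64_t b) {
      return a ^ (b + 0x9e3779b97f4a7c15ULL + (a << 6) + (a >> 2));
    }

    // Order-sensitive fold over per-element hashes; equal element lists
    // produce equal hashes, matching the equality definition.
    uint64_t HashTupleElements(const std::vector<uint64_t>& element_hashes) {
      uint64_t h = 0;
      for (uint64_t element : element_hashes) {
        h = Hash64Combine(h, element);
      }
      return h;
    }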
diff --git a/tensorflow/compiler/xla/service/hlo_sharding_test.cc b/tensorflow/compiler/xla/service/hlo_sharding_test.cc
index 00ea38480e..d0a20471a0 100644
--- a/tensorflow/compiler/xla/service/hlo_sharding_test.cc
+++ b/tensorflow/compiler/xla/service/hlo_sharding_test.cc
@@ -132,29 +132,6 @@ TEST_F(HloShardingTest, Tile) {
}
}
-TEST_F(HloShardingTest, NestedTuple) {
- // nested_tuple_shape = (f32[], (f32[3]), f32[4, 6])
- Shape nested_tuple_shape = ShapeUtil::MakeTupleShape({
- ShapeUtil::MakeShape(F32, {}),
- ShapeUtil::MakeTupleShape({ShapeUtil::MakeShape(F32, {3})}),
- ShapeUtil::MakeShape(F32, {4, 6}),
- });
-
- OpSharding proto;
- proto.set_type(OpSharding::Type::OpSharding_Type_TUPLE);
- *proto.add_tuple_shardings() = HloSharding::Replicate().ToProto();
- *proto.add_tuple_shardings() = HloSharding::AssignDevice(0).ToProto();
- *proto.add_tuple_shardings() = HloSharding::AssignDevice(1).ToProto();
- HloSharding tuple_sharding =
- HloSharding::FromProto(proto).ConsumeValueOrDie();
-
- ShapeTree<HloSharding> shape_tree =
- tuple_sharding.GetTupleShardingsAsShapeTree(nested_tuple_shape);
- EXPECT_EQ(shape_tree.element({0}), HloSharding::Replicate());
- EXPECT_EQ(shape_tree.element({1, 0}), HloSharding::AssignDevice(0));
- EXPECT_EQ(shape_tree.element({2}), HloSharding::AssignDevice(1));
-}
-
TEST_F(HloShardingTest, Hash) {
auto hash_compare_equal = [](const HloSharding& a, const HloSharding& b) {
if (a.Hash() != b.Hash()) {
@@ -207,51 +184,6 @@ TEST_F(HloShardingTest, Hash) {
MakeArray({2, 2}, {0, 3, 1, 2}));
EXPECT_FALSE(hash_compare_equal(sharding1, sharding2));
}
-
- HloSharding default_sharding = HloSharding::Replicate();
- {
- ShapeTree<HloSharding> shape_tree(ShapeUtil::MakeTupleShape({}),
- default_sharding);
- HloSharding sharding1 = HloSharding::Replicate();
- HloSharding sharding2 = HloSharding::Tuple(shape_tree);
- EXPECT_FALSE(hash_compare_equal(sharding1, sharding2));
- }
-
- {
- ShapeTree<HloSharding> shape_tree(ShapeUtil::MakeTupleShape({}),
- default_sharding);
- HloSharding sharding1 = HloSharding::Tuple(shape_tree);
- HloSharding sharding2 = HloSharding::Tuple(shape_tree);
- EXPECT_TRUE(hash_compare_equal(sharding1, sharding2));
- }
-
- {
- ShapeTree<HloSharding> shape_tree1(
- ShapeUtil::MakeTupleShape({ShapeUtil::MakeShape(F32, {4})}),
- default_sharding);
- *shape_tree1.mutable_element({0}) = HloSharding::Replicate();
- ShapeTree<HloSharding> shape_tree2(
- ShapeUtil::MakeTupleShape({ShapeUtil::MakeShape(F32, {4})}),
- default_sharding);
- *shape_tree2.mutable_element({0}) = HloSharding::AssignDevice(0);
- HloSharding sharding1 = HloSharding::Tuple(shape_tree1);
- HloSharding sharding2 = HloSharding::Tuple(shape_tree2);
- EXPECT_FALSE(hash_compare_equal(sharding1, sharding2));
- }
-
- {
- ShapeTree<HloSharding> shape_tree1(
- ShapeUtil::MakeTupleShape({ShapeUtil::MakeShape(F32, {4})}),
- default_sharding);
- *shape_tree1.mutable_element({0}) = HloSharding::AssignDevice(0);
- ShapeTree<HloSharding> shape_tree2(
- ShapeUtil::MakeTupleShape({ShapeUtil::MakeShape(F32, {4})}),
- default_sharding);
- *shape_tree2.mutable_element({0}) = HloSharding::AssignDevice(0);
- HloSharding sharding1 = HloSharding::Tuple(shape_tree1);
- HloSharding sharding2 = HloSharding::Tuple(shape_tree2);
- EXPECT_TRUE(hash_compare_equal(sharding1, sharding2));
- }
}
} // namespace
diff --git a/tensorflow/compiler/xla/service/hlo_verifier.cc b/tensorflow/compiler/xla/service/hlo_verifier.cc
index c938450891..c1aa655401 100644
--- a/tensorflow/compiler/xla/service/hlo_verifier.cc
+++ b/tensorflow/compiler/xla/service/hlo_verifier.cc
@@ -270,40 +270,12 @@ class ShapeVerifier : public DfsHloVisitor {
pad->padding_config()));
}
- Status HandleSend(HloInstruction* send) override {
- TF_RET_CHECK(send->users().size() == 1);
- const HloInstruction* send_done = send->users()[0];
- TF_RET_CHECK(send_done->opcode() == HloOpcode::kSendDone);
- TF_RETURN_IF_ERROR(CheckSameChannel(send, send_done));
- return CheckShape(
- send, ShapeUtil::MakeTupleShape(
- {send->operand(0)->shape(), ShapeUtil::MakeShape(U32, {})}));
- }
-
- Status HandleSendDone(HloInstruction* send_done) override {
- TF_RET_CHECK(send_done->operands().size() == 1);
- const HloInstruction* send = send_done->operand(0);
- TF_RET_CHECK(send->opcode() == HloOpcode::kSend);
- TF_RETURN_IF_ERROR(CheckSameChannel(send, send_done));
- return CheckShape(send_done, ShapeUtil::MakeNil());
- }
-
- Status HandleRecv(HloInstruction* recv) override {
- TF_RET_CHECK(recv->users().size() == 1);
- const HloInstruction* recv_done = recv->users()[0];
- TF_RET_CHECK(recv_done->opcode() == HloOpcode::kRecvDone);
- TF_RETURN_IF_ERROR(CheckSameChannel(recv, recv_done));
- return CheckShape(recv,
- ShapeUtil::MakeTupleShape(
- {recv_done->shape(), ShapeUtil::MakeShape(U32, {})}));
+ Status HandleSend(HloInstruction*) override {
+ return tensorflow::Status::OK();
}
- Status HandleRecvDone(HloInstruction* recv_done) override {
- TF_RET_CHECK(recv_done->operands().size() == 1);
- const HloInstruction* recv = recv_done->operand(0);
- TF_RET_CHECK(recv->opcode() == HloOpcode::kRecv);
- TF_RETURN_IF_ERROR(CheckSameChannel(recv, recv_done));
- return CheckShape(recv_done, recv->shape().tuple_shapes(0));
+ Status HandleRecv(HloInstruction*) override {
+ return tensorflow::Status::OK();
}
Status HandleBatchNormTraining(HloInstruction* batch_norm_training) override {
@@ -393,19 +365,6 @@ class ShapeVerifier : public DfsHloVisitor {
instruction->opcode(), instruction->operands()));
}
- // Checks if the given two instructions shares the same channel id.
- Status CheckSameChannel(const HloInstruction* instr1,
- const HloInstruction* instr2) {
- if (instr1->channel_id() != instr2->channel_id()) {
- return FailedPrecondition(
- "Expected to have the same channel id, actual channel ids are: %s "
- "(%lld), %s (%lld)",
- instr1->ToString().c_str(), instr1->channel_id(),
- instr2->ToString().c_str(), instr2->channel_id());
- }
- return tensorflow::Status::OK();
- }
-
// Returns the size of a Shape in bytes.
const std::function<int64(const Shape&)> shape_size_fn_;
};
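The deleted CheckSameChannel is the glue that pairs each Send with its SendDone and each Recv with its RecvDone during verification: both halves must carry the same channel id or verification fails. The core is a one-field comparison; a sketch with a hypothetical Status type in place of tensorflow::Status:

    #include <cstdint>
    #include <string>

    struct Status {  // hypothetical stand-in for tensorflow::Status
      bool ok;
      std::string message;
    };

    // Channel ids pair a Send with the matching Recv across computations,
    // so the initiating op and its *Done op must agree on the id.
    Status CheckSameChannel(int64_t channel_id1, int64_t channel_id2) {
      if (channel_id1 != channel_id2) {
        return {false, "expected matching channel ids, got " +
                           std::to_string(channel_id1) + " vs " +
                           std::to_string(channel_id2)};
      }
      return {true, ""};
    }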
diff --git a/tensorflow/compiler/xla/service/instruction_fusion.cc b/tensorflow/compiler/xla/service/instruction_fusion.cc
index dea47b1fd7..0d1b7bc109 100644
--- a/tensorflow/compiler/xla/service/instruction_fusion.cc
+++ b/tensorflow/compiler/xla/service/instruction_fusion.cc
@@ -113,9 +113,7 @@ namespace xla {
case HloOpcode::kTrace:
case HloOpcode::kWhile:
case HloOpcode::kSend:
- case HloOpcode::kSendDone:
case HloOpcode::kRecv:
- case HloOpcode::kRecvDone:
return true;
}
diff --git a/tensorflow/compiler/xla/service/interpreter/executable.cc b/tensorflow/compiler/xla/service/interpreter/executable.cc
index 96f937caf9..86dee8462f 100644
--- a/tensorflow/compiler/xla/service/interpreter/executable.cc
+++ b/tensorflow/compiler/xla/service/interpreter/executable.cc
@@ -89,7 +89,7 @@ StatusOr<se::DeviceMemoryBase> InterpreterExecutable::ExecuteOnStream(
uint64 start_micros = tensorflow::Env::Default()->NowMicros();
- const HloComputation* computation = module().entry_computation();
+ HloComputation* computation = module().entry_computation();
if (computation->num_parameters() != arguments.size()) {
return tensorflow::errors::Internal(
"Mismatch between argument count and graph parameter count.");
diff --git a/tensorflow/compiler/xla/service/layout_assignment_test.cc b/tensorflow/compiler/xla/service/layout_assignment_test.cc
index d51c0d1dfb..c39ff52230 100644
--- a/tensorflow/compiler/xla/service/layout_assignment_test.cc
+++ b/tensorflow/compiler/xla/service/layout_assignment_test.cc
@@ -131,10 +131,10 @@ TEST_F(LayoutAssignmentTest, FusionInstruction) {
std::vector<std::initializer_list<int64>> minor_to_majors = {{0, 1}, {1, 0}};
for (auto& minor_to_major : minor_to_majors) {
auto builder = HloComputation::Builder(TestName());
- auto constant_literal1 = Literal::CreateR2WithLayout<float>(
- {{1.0, 2.0}, {3.0, 4.0}}, LayoutUtil::MakeLayout(minor_to_major));
- auto constant_literal2 = Literal::CreateR2WithLayout<float>(
- {{5.0, 6.0}, {7.0, 8.0}}, LayoutUtil::MakeLayout(minor_to_major));
+ auto constant_literal1 = test_utils::CreateR2LiteralWithLayout<float>(
+ {{1.0, 2.0}, {3.0, 4.0}}, minor_to_major);
+ auto constant_literal2 = test_utils::CreateR2LiteralWithLayout<float>(
+ {{5.0, 6.0}, {7.0, 8.0}}, minor_to_major);
Shape ashape = constant_literal1->shape();
auto constant1 = builder.AddInstruction(
@@ -181,12 +181,12 @@ TEST_F(LayoutAssignmentTest, TupleLayout) {
// Verify the layouts of a tuple are assigned properly (the element layouts
// match their source).
auto builder = HloComputation::Builder(TestName());
- auto constant0 = builder.AddInstruction(
- HloInstruction::CreateConstant(Literal::CreateR2WithLayout<float>(
- {{1.0, 2.0}, {3.0, 4.0}}, LayoutUtil::MakeLayout({0, 1}))));
- auto constant1 = builder.AddInstruction(
- HloInstruction::CreateConstant(Literal::CreateR2WithLayout<float>(
- {{1.0, 2.0}, {3.0, 4.0}}, LayoutUtil::MakeLayout({1, 0}))));
+ auto constant0 = builder.AddInstruction(HloInstruction::CreateConstant(
+ test_utils::CreateR2LiteralWithLayout<float>({{1.0, 2.0}, {3.0, 4.0}},
+ {0, 1})));
+ auto constant1 = builder.AddInstruction(HloInstruction::CreateConstant(
+ test_utils::CreateR2LiteralWithLayout<float>({{1.0, 2.0}, {3.0, 4.0}},
+ {1, 0})));
auto tuple = builder.AddInstruction(
HloInstruction::CreateTuple({constant0, constant1}));
@@ -218,12 +218,12 @@ TEST_F(LayoutAssignmentTest, TupleLayout) {
TEST_F(LayoutAssignmentTest, TupleSelect) {
// Verify layouts of a select with tuple operands is assigned properly.
auto builder = HloComputation::Builder(TestName());
- auto constant0 = builder.AddInstruction(
- HloInstruction::CreateConstant(Literal::CreateR2WithLayout<float>(
- {{1.0, 2.0}, {3.0, 4.0}}, LayoutUtil::MakeLayout({0, 1}))));
- auto constant1 = builder.AddInstruction(
- HloInstruction::CreateConstant(Literal::CreateR2WithLayout<float>(
- {{1.0, 2.0}, {3.0, 4.0}}, LayoutUtil::MakeLayout({1, 0}))));
+ auto constant0 = builder.AddInstruction(HloInstruction::CreateConstant(
+ test_utils::CreateR2LiteralWithLayout<float>({{1.0, 2.0}, {3.0, 4.0}},
+ {0, 1})));
+ auto constant1 = builder.AddInstruction(HloInstruction::CreateConstant(
+ test_utils::CreateR2LiteralWithLayout<float>({{1.0, 2.0}, {3.0, 4.0}},
+ {1, 0})));
auto tuple0 = builder.AddInstruction(
HloInstruction::CreateTuple({constant0, constant1}));
auto tuple1 = builder.AddInstruction(
diff --git a/tensorflow/compiler/xla/service/llvm_ir/BUILD b/tensorflow/compiler/xla/service/llvm_ir/BUILD
index 8f24bb1718..075d4a1ab5 100644
--- a/tensorflow/compiler/xla/service/llvm_ir/BUILD
+++ b/tensorflow/compiler/xla/service/llvm_ir/BUILD
@@ -155,30 +155,6 @@ cc_library(
],
)
-cc_library(
- name = "vector_support_library",
- srcs = ["vector_support_library.cc"],
- hdrs = ["vector_support_library.h"],
- deps = [
- "//tensorflow/compiler/xla:types",
- "//tensorflow/compiler/xla:xla_data_proto",
- "//tensorflow/compiler/xla/service/llvm_ir:llvm_util",
- "@llvm//:core",
- ],
-)
-
-cc_library(
- name = "kernel_support_library",
- srcs = ["kernel_support_library.cc"],
- hdrs = ["kernel_support_library.h"],
- deps = [
- ":llvm_loop",
- "//tensorflow/compiler/xla/service/llvm_ir:llvm_util",
- "//tensorflow/core:lib",
- "@llvm//:core",
- ],
-)
-
# -----------------------------------------------------------------------------
filegroup(
diff --git a/tensorflow/compiler/xla/service/llvm_ir/kernel_support_library.cc b/tensorflow/compiler/xla/service/llvm_ir/kernel_support_library.cc
deleted file mode 100644
index 29cc0f81bd..0000000000
--- a/tensorflow/compiler/xla/service/llvm_ir/kernel_support_library.cc
+++ /dev/null
@@ -1,65 +0,0 @@
-/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
- http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-
-#include "tensorflow/compiler/xla/service/llvm_ir/kernel_support_library.h"
-
-#include "tensorflow/compiler/xla/service/llvm_ir/llvm_loop.h"
-
-namespace xla {
-void KernelSupportLibrary::For(
- tensorflow::StringPiece name, llvm::Value* start, llvm::Value* end,
- llvm::Value* step,
- const std::function<void(llvm::Value*, bool)>& for_body_generator) {
- If(ir_builder_->CreateICmpSLT(start, end), [&]() {
- for_body_generator(start, /*is_first_iteration=*/true);
- For(name, ir_builder_->CreateAdd(start, step), end, step,
- [&](llvm::Value* iv) { for_body_generator(iv, false); });
- });
-}
-
-void KernelSupportLibrary::For(
- tensorflow::StringPiece name, llvm::Value* start, llvm::Value* end,
- llvm::Value* step, bool peel_first_iteration,
- const std::function<void(llvm::Value*, llvm::Value*)>& for_body_generator) {
- if (peel_first_iteration) {
- For(name, start, end, step, true,
- [&](llvm::Value* indvar, bool is_first_iteration) {
- for_body_generator(indvar, ir_builder_->getInt1(is_first_iteration));
- });
- } else {
- std::unique_ptr<llvm_ir::ForLoop> loop = llvm_ir::ForLoop::EmitForLoop(
- name, start, end, step, ir_builder_,
- /*prevent_unrolling=*/prevent_unrolling_,
- /*prevent_vectorization=*/prevent_vectorization_);
- ir_builder_->SetInsertPoint(&loop->GetBodyBasicBlock()->back());
- for_body_generator(loop->GetIndVarValue(),
- /*is_first_iteration=*/ir_builder_->CreateICmpEQ(
- loop->GetIndVarValue(), start));
- llvm_ir::SetToLastInsertPoint(loop->GetExitBasicBlock(), ir_builder_);
- }
-}
-
-void KernelSupportLibrary::If(
- llvm::Value* condition, const std::function<void()>& true_block_generator,
- const std::function<void()>& false_block_generator) {
- llvm_ir::LlvmIfData if_data =
- llvm_ir::EmitIfThenElse(condition, "", ir_builder_);
- ir_builder_->SetInsertPoint(&if_data.true_block->back());
- true_block_generator();
- ir_builder_->SetInsertPoint(&if_data.false_block->back());
- false_block_generator();
- llvm_ir::SetToLastInsertPoint(if_data.after_block, ir_builder_);
-}
-} // namespace xla
diff --git a/tensorflow/compiler/xla/service/llvm_ir/kernel_support_library.h b/tensorflow/compiler/xla/service/llvm_ir/kernel_support_library.h
deleted file mode 100644
index 9bafb7b577..0000000000
--- a/tensorflow/compiler/xla/service/llvm_ir/kernel_support_library.h
+++ /dev/null
@@ -1,128 +0,0 @@
-/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
- http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-
-#ifndef THIRD_PARTY_TENSORFLOW_COMPILER_XLA_SERVICE_CPU_KERNEL_SUPPORT_LIBRARY_H_
-#define THIRD_PARTY_TENSORFLOW_COMPILER_XLA_SERVICE_CPU_KERNEL_SUPPORT_LIBRARY_H_
-
-#include <string>
-
-#include "llvm/IR/BasicBlock.h"
-#include "llvm/IR/IRBuilder.h"
-#include "llvm/IR/Value.h"
-#include "tensorflow/compiler/xla/service/llvm_ir/llvm_util.h"
-#include "tensorflow/core/lib/core/stringpiece.h"
-
-namespace xla {
-// A thin wrapper around llvm_loop.h to make code generating structured control
-// flow more readable.
-class KernelSupportLibrary {
- public:
- // `ir_builder` is the llvm::IRBuilder instance used to generate LLVM IR.
- // If `prevent_unrolling` is true then unrolling is explicitly disabled on
- // every loop generated by this instance of KernelSupportLibrary.
- explicit KernelSupportLibrary(llvm::IRBuilder<>* ir_builder,
- bool prevent_unrolling = true,
- bool prevent_vectorization = true)
- : ir_builder_(ir_builder),
- prevent_unrolling_(prevent_unrolling),
- prevent_vectorization_(prevent_vectorization) {}
-
- // Generates the following control flow structure:
- //
- // if (`start` < `end`) {
- // `for_body_generator(/*ind_var=*/start, /*is_first_iteration=*/true)`;
- // for (i64 i = `start` + `step`; i s< `end`; i += `step`)
- // `for_body_generator(/*ind_var=*/,i, /*is_first_iteration=*/false)`;
- // }
- void For(
- tensorflow::StringPiece name, llvm::Value* start, llvm::Value* end,
- llvm::Value* step,
- const std::function<void(llvm::Value* ind_var, bool is_first_iteration)>&
- for_body_generator);
-
- void For(
- tensorflow::StringPiece name, int64 start, int64 end, int64 step,
- const std::function<void(llvm::Value* ind_var, bool is_first_iteration)>&
- for_body_generator) {
- For(name, /*start=*/ir_builder_->getInt64(start),
- /*end=*/ir_builder_->getInt64(end),
- /*step=*/ir_builder_->getInt64(step), for_body_generator);
- }
-
- // Generates the following control flow structure if `peel_first_iteration` is
- // true:
- //
- // if (`start` < `end`) {
- // `for_body_generator(/*ind_var=*/start, /*is_first_iteration=*/,true)`;
- // for (i64 i = `start` + `step`; i s< `end`; i += `step`)
- // `for_body_generator(/*ind_var=*/,i, /*is_first_iteration=*/,false)`;
- // }
- //
- // and the following if `peel_first_iteration` is false:
- //
- // for (i64 i = `start`; i s< `end`; i += `step`)
- // `for_body_generator(/*ind_var=*/,i,
- // /*is_first_iteration=*/,(i != `start`))`;
- void For(tensorflow::StringPiece name, llvm::Value* start, llvm::Value* end,
- llvm::Value* step, bool peel_first_iteration,
- const std::function<void(llvm::Value* ind_var,
- llvm::Value* is_first_iteration)>&
- for_body_generator);
-
- void For(tensorflow::StringPiece name, llvm::Value* start, llvm::Value* end,
- int64 step, bool peel_first_iteration,
- const std::function<void(llvm::Value* ind_var,
- llvm::Value* is_first_iteration)>&
- for_body_generator) {
- For(name, /*start=*/start, /*end=*/end,
- /*step=*/ir_builder_->getInt64(step), peel_first_iteration,
- for_body_generator);
- }
-
- void For(
- tensorflow::StringPiece name, llvm::Value* start, llvm::Value* end,
- llvm::Value* step,
- const std::function<void(llvm::Value* ind_var)>& for_body_generator) {
- For(name, start, end, step,
- /*peel_first_iteration=*/false,
- [&](llvm::Value* indvar, llvm::Value*) { for_body_generator(indvar); });
- }
-
- void For(
- tensorflow::StringPiece name, int64 start, int64 end, int64 step,
- const std::function<void(llvm::Value* ind_var)>& for_body_generator) {
- For(name, /*start=*/ir_builder_->getInt64(start),
- /*end=*/ir_builder_->getInt64(end),
- /*step=*/ir_builder_->getInt64(step), for_body_generator);
- }
-
- // Generates the following control flow structure:
- //
- // if (`condition`)
- // `true_block_generator()`;
- // else
- // `false_block_generator()`;
- void If(llvm::Value* condition,
- const std::function<void()>& true_block_generator,
- const std::function<void()>& false_block_generator = []() {});
-
- private:
- llvm::IRBuilder<>* ir_builder_;
- bool prevent_unrolling_;
- bool prevent_vectorization_;
-};
-} // namespace xla
-
-#endif // THIRD_PARTY_TENSORFLOW_COMPILER_XLA_SERVICE_CPU_KERNEL_SUPPORT_LIBRARY_H_
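The deleted For overload peels the first iteration so generated code can special-case it: emit the body once for start, then loop from start + step with is_first_iteration fixed to false. The same control shape in ordinary C++, as a reference for what the emitted IR computes (plain int64_t instead of llvm::Value*):

    #include <cstdint>
    #include <functional>

    // Reference semantics for the peeled loop: body(start, true) runs only
    // when the trip count is nonzero, and all later induction values see
    // is_first_iteration == false.
    void PeeledFor(int64_t start, int64_t end, int64_t step,
                   const std::function<void(int64_t, bool)>& body) {
      if (start < end) {
        body(start, /*is_first_iteration=*/true);
        for (int64_t i = start + step; i < end; i += step) {
          body(i, /*is_first_iteration=*/false);
        }
      }
    }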
diff --git a/tensorflow/compiler/xla/service/llvm_ir/llvm_loop.cc b/tensorflow/compiler/xla/service/llvm_ir/llvm_loop.cc
index 7b227ce294..83d35cb9ef 100644
--- a/tensorflow/compiler/xla/service/llvm_ir/llvm_loop.cc
+++ b/tensorflow/compiler/xla/service/llvm_ir/llvm_loop.cc
@@ -34,24 +34,21 @@ namespace llvm_ir {
ForLoop::ForLoop(tensorflow::StringPiece prefix, tensorflow::StringPiece suffix,
llvm::Value* start_index, llvm::Value* end_index,
- llvm::Value* step, bool prevent_unrolling,
- bool prevent_vectorization)
+ llvm::Value* step, bool prevent_unrolling)
: prefix_(prefix.ToString()),
suffix_(suffix.ToString()),
start_index_(start_index),
end_index_(end_index),
step_(step),
insert_before_bb_(nullptr),
- prevent_unrolling_(prevent_unrolling),
- prevent_vectorization_(prevent_vectorization) {}
+ prevent_unrolling_(prevent_unrolling) {}
/* static */ std::unique_ptr<ForLoop> ForLoop::EmitForLoop(
tensorflow::StringPiece prefix, llvm::Value* start_index,
llvm::Value* end_index, llvm::Value* step, llvm::IRBuilder<>* ir_builder,
- bool prevent_unrolling, bool prevent_vectorization) {
- std::unique_ptr<ForLoop> loop(new ForLoop(prefix, /*suffix=*/"", start_index,
- end_index, step, prevent_unrolling,
- prevent_vectorization));
+ bool prevent_unrolling) {
+ std::unique_ptr<ForLoop> loop(new ForLoop(
+ prefix, /*suffix=*/"", start_index, end_index, step, prevent_unrolling));
loop->Emit(ir_builder);
return loop;
}
@@ -130,12 +127,14 @@ void ForLoop::Emit(llvm::IRBuilder<>* ir_builder) {
ir_builder->CreateStore(indvar_inc, indvar_address);
llvm::BranchInst* back_branch = ir_builder->CreateBr(header_bb_);
- std::vector<llvm::Metadata*> loop_metadata = GetLoopMetadata(ir_builder);
- if (!loop_metadata.empty()) {
- llvm::LLVMContext* ctx = &start_index_->getContext();
+ if (prevent_unrolling_) {
+ const char* const kLlvmLoopUnrollDisableMDName = "llvm.loop.unroll.disable";
+ llvm::LLVMContext* ctx = &back_branch->getContext();
+
auto temp_node = llvm::MDNode::getTemporary(*ctx, llvm::None);
- loop_metadata.insert(loop_metadata.begin(), temp_node.get());
- auto loop_id = llvm::MDNode::get(*ctx, loop_metadata);
+ auto no_unroll_node = llvm::MDNode::get(
+ *ctx, {llvm::MDString::get(*ctx, kLlvmLoopUnrollDisableMDName)});
+ auto loop_id = llvm::MDNode::get(*ctx, {temp_node.get(), no_unroll_node});
loop_id->replaceOperandWith(0, loop_id);
back_branch->setMetadata(llvm::LLVMContext::MD_loop, loop_id);
}
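The block above relies on LLVM's convention that a loop's `!llvm.loop` ID is a distinct, self-referential metadata node whose first operand is the node itself; the temporary node exists only so the operand list can be built before the self-reference is patched in with `replaceOperandWith(0, ...)`. A standalone sketch of the same construction:

#include "llvm/IR/LLVMContext.h"
#include "llvm/IR/Metadata.h"

llvm::MDNode* MakeNoUnrollLoopId(llvm::LLVMContext& ctx) {
  auto temp_node = llvm::MDNode::getTemporary(ctx, llvm::None);
  auto* no_unroll = llvm::MDNode::get(
      ctx, {llvm::MDString::get(ctx, "llvm.loop.unroll.disable")});
  auto* loop_id = llvm::MDNode::get(ctx, {temp_node.get(), no_unroll});
  loop_id->replaceOperandWith(0, loop_id);  // Patch in the self-reference.
  // In textual IR this prints roughly as:
  //   !0 = distinct !{!0, !1}
  //   !1 = !{!"llvm.loop.unroll.disable"}
  return loop_id;
}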
@@ -144,27 +143,6 @@ void ForLoop::Emit(llvm::IRBuilder<>* ir_builder) {
ir_builder->SetInsertPoint(exit_bb_);
}
-std::vector<llvm::Metadata*> ForLoop::GetLoopMetadata(
- llvm::IRBuilder<>* ir_builder) {
- const char* const kLlvmLoopUnrollDisableMDName = "llvm.loop.unroll.disable";
- const char* const kLlvmLoopVectorizeMDName = "llvm.loop.vectorize.enable";
- llvm::LLVMContext* ctx = &start_index_->getContext();
-
- std::vector<llvm::Metadata*> result;
- if (prevent_unrolling_) {
- result.push_back(llvm::MDNode::get(
- *ctx, {llvm::MDString::get(*ctx, kLlvmLoopUnrollDisableMDName)}));
- }
-
- if (prevent_vectorization_) {
- result.push_back(llvm::MDNode::get(
- *ctx, {llvm::MDString::get(*ctx, kLlvmLoopVectorizeMDName),
- llvm::ConstantAsMetadata::get(ir_builder->getFalse())}));
- }
-
- return result;
-}
-
string ForLoop::GetQualifiedName(tensorflow::StringPiece name) {
return llvm_ir::IrName(prefix_, llvm_ir::IrName(name, suffix_));
}
@@ -178,25 +156,23 @@ llvm::BasicBlock* ForLoop::CreateLoopBB(tensorflow::StringPiece name,
std::unique_ptr<ForLoop> ForLoopNest::AddLoop(tensorflow::StringPiece suffix,
llvm::Value* start_index,
llvm::Value* end_index,
- bool prevent_unrolling,
- bool prevent_vectorization) {
+ bool prevent_unrolling) {
return AddLoop(suffix, start_index, end_index, ir_builder_->getInt64(1),
- prevent_unrolling, prevent_vectorization);
+ prevent_unrolling);
}
std::unique_ptr<ForLoop> ForLoopNest::AddLoop(tensorflow::StringPiece suffix,
llvm::Value* start_index,
llvm::Value* end_index,
llvm::Value* stride,
- bool prevent_unrolling,
- bool prevent_vectorization) {
+ bool prevent_unrolling) {
if (inner_loop_body_bb_ != nullptr) {
// Create this loop inside the previous one.
ir_builder_->SetInsertPoint(&*inner_loop_body_bb_->getFirstInsertionPt());
}
std::unique_ptr<ForLoop> loop(new ForLoop(
/*prefix=*/name_, suffix, start_index, end_index, stride,
- prevent_unrolling, prevent_vectorization));
+ prevent_unrolling));
loop->Emit(ir_builder_);
if (outer_loop_preheader_bb_ == nullptr) {
@@ -215,24 +191,20 @@ std::unique_ptr<ForLoop> ForLoopNest::AddLoop(tensorflow::StringPiece suffix,
std::unique_ptr<ForLoop> ForLoopNest::AddLoop(int64 start_index,
int64 end_index,
tensorflow::StringPiece suffix,
- bool prevent_unrolling,
- bool prevent_vectorization) {
+ bool prevent_unrolling) {
CHECK_LE(start_index, end_index);
return AddLoop(suffix, ir_builder_->getInt64(start_index),
- ir_builder_->getInt64(end_index), prevent_unrolling,
- prevent_vectorization);
+ ir_builder_->getInt64(end_index), prevent_unrolling);
}
std::unique_ptr<ForLoop> ForLoopNest::AddLoop(int64 start_index,
int64 end_index, int64 stride,
tensorflow::StringPiece suffix,
- bool prevent_unrolling,
- bool prevent_vectorization) {
+ bool prevent_unrolling) {
CHECK_LE(start_index, end_index);
return AddLoop(suffix, ir_builder_->getInt64(start_index),
ir_builder_->getInt64(end_index),
- ir_builder_->getInt64(stride), prevent_unrolling,
- prevent_vectorization);
+ ir_builder_->getInt64(stride), prevent_unrolling);
}
IrArray::Index ForLoopNest::AddLoopsForShape(const Shape& shape,
diff --git a/tensorflow/compiler/xla/service/llvm_ir/llvm_loop.h b/tensorflow/compiler/xla/service/llvm_ir/llvm_loop.h
index 20069ce5a2..90f7c7df9e 100644
--- a/tensorflow/compiler/xla/service/llvm_ir/llvm_loop.h
+++ b/tensorflow/compiler/xla/service/llvm_ir/llvm_loop.h
@@ -71,10 +71,12 @@ class ForLoop {
//
// If `prevent_unrolling` is true then emit metadata that directs LLVM to not
// unroll the generated loop.
- static std::unique_ptr<ForLoop> EmitForLoop(
- tensorflow::StringPiece prefix, llvm::Value* start_index,
- llvm::Value* end_index, llvm::Value* step, llvm::IRBuilder<>* ir_builder,
- bool prevent_unrolling = false, bool prevent_vectorization = false);
+ static std::unique_ptr<ForLoop> EmitForLoop(tensorflow::StringPiece prefix,
+ llvm::Value* start_index,
+ llvm::Value* end_index,
+ llvm::Value* step,
+ llvm::IRBuilder<>* ir_builder,
+ bool prevent_unrolling = false);
// The names of the blocks follow LLVM's conventions. Control flow amongst the
// blocks for the example C code looks like:
@@ -128,7 +130,7 @@ class ForLoop {
ForLoop(tensorflow::StringPiece prefix, tensorflow::StringPiece suffix,
llvm::Value* start_index, llvm::Value* end_index, llvm::Value* step,
- bool prevent_unrolling, bool prevent_vectorization);
+ bool prevent_unrolling);
// Emit the loop at the insert point of the builder.
void Emit(llvm::IRBuilder<>* ir_builder);
@@ -140,10 +142,6 @@ class ForLoop {
// they are set.
string GetQualifiedName(tensorflow::StringPiece name);
- // Return a list of metadata nodes that should be associated with the
- // llvm::Loop for this `ForLoop`.
- std::vector<llvm::Metadata*> GetLoopMetadata(llvm::IRBuilder<>* ir_builder);
-
string prefix_;
string suffix_;
llvm::Value* start_index_;
@@ -162,7 +160,6 @@ class ForLoop {
llvm::BasicBlock* exit_bb_;
llvm::Value* indvar_;
bool prevent_unrolling_;
- bool prevent_vectorization_;
TF_DISALLOW_COPY_AND_ASSIGN(ForLoop);
};
@@ -188,28 +185,24 @@ class ForLoopNest {
std::unique_ptr<ForLoop> AddLoop(tensorflow::StringPiece suffix,
llvm::Value* start_index,
llvm::Value* end_index, llvm::Value* stride,
- bool prevent_unrolling = false,
- bool prevent_vectorization = false);
+ bool prevent_unrolling = false);
// Like the above, except that it defaults to a stride of one.
std::unique_ptr<ForLoop> AddLoop(tensorflow::StringPiece suffix,
llvm::Value* start_index,
llvm::Value* end_index,
- bool prevent_unrolling = false,
- bool prevent_vectorization = false);
+ bool prevent_unrolling = false);
  // A convenience wrapper for the other flavor of AddLoop; the given start
  // and end indices are constants.
std::unique_ptr<ForLoop> AddLoop(int64 start_index, int64 end_index,
int64 stride, tensorflow::StringPiece suffix,
- bool prevent_unrolling = false,
- bool prevent_vectorization = false);
+ bool prevent_unrolling = false);
// Like the above, except that it defaults to a stride of one.
std::unique_ptr<ForLoop> AddLoop(int64 start_index, int64 end_index,
tensorflow::StringPiece suffix,
- bool prevent_unrolling = false,
- bool prevent_vectorization = false);
+ bool prevent_unrolling = false);
// Add loops to iterate through the indices within the specified
// shape. The returned index collects the induction variables of the
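To see how the `AddLoop` overloads above compose, here is a hypothetical two-level nest. It assumes the `ForLoopNest(name, ir_builder)` constructor and the `GetInnerLoopBodyBasicBlock()` accessor from this header at this revision; `EmitRowColNest` is an illustrative name:

#include "llvm/IR/IRBuilder.h"
#include "tensorflow/compiler/xla/service/llvm_ir/llvm_loop.h"
#include "tensorflow/compiler/xla/service/llvm_ir/llvm_util.h"

// Emits a row-major double loop; only the inner loop carries the
// unroll-disable metadata.
void EmitRowColNest(llvm::IRBuilder<>* b, int64_t rows, int64_t cols) {
  xla::llvm_ir::ForLoopNest nest("matrix_walk", b);
  auto row_loop = nest.AddLoop(/*start_index=*/0, /*end_index=*/rows, "row");
  auto col_loop = nest.AddLoop(/*start_index=*/0, /*end_index=*/cols, "col",
                               /*prevent_unrolling=*/true);
  // Position the builder in the innermost body; the per-element payload
  // would be emitted here.
  xla::llvm_ir::SetToFirstInsertPoint(nest.GetInnerLoopBodyBasicBlock(), b);
}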
diff --git a/tensorflow/compiler/xla/service/llvm_ir/llvm_util.cc b/tensorflow/compiler/xla/service/llvm_ir/llvm_util.cc
index d95409e399..956c0d5f05 100644
--- a/tensorflow/compiler/xla/service/llvm_ir/llvm_util.cc
+++ b/tensorflow/compiler/xla/service/llvm_ir/llvm_util.cc
@@ -537,14 +537,6 @@ void SetToFirstInsertPoint(llvm::BasicBlock* blk, llvm::IRBuilder<>* builder) {
builder->SetInsertPoint(blk, blk->getFirstInsertionPt());
}
-void SetToLastInsertPoint(llvm::BasicBlock* blk, llvm::IRBuilder<>* builder) {
- if (llvm::Instruction* terminator = blk->getTerminator()) {
- builder->SetInsertPoint(terminator);
- } else {
- builder->SetInsertPoint(blk);
- }
-}
-
llvm::Value* CreateRor(llvm::Value* rotand, llvm::Value* rotor,
llvm::IRBuilder<>* builder) {
auto size = rotand->getType()->getPrimitiveSizeInBits();
diff --git a/tensorflow/compiler/xla/service/llvm_ir/llvm_util.h b/tensorflow/compiler/xla/service/llvm_ir/llvm_util.h
index f70d9f88b3..304192b58e 100644
--- a/tensorflow/compiler/xla/service/llvm_ir/llvm_util.h
+++ b/tensorflow/compiler/xla/service/llvm_ir/llvm_util.h
@@ -243,8 +243,6 @@ llvm::Instruction* AddRangeMetadata(int64 lower, int64 upper,
void SetToFirstInsertPoint(llvm::BasicBlock* blk, llvm::IRBuilder<>* builder);
-void SetToLastInsertPoint(llvm::BasicBlock* blk, llvm::IRBuilder<>* builder);
-
// Create a bitwise rotation of `rotand` by `rotor`.
llvm::Value* CreateRor(llvm::Value* rotand, llvm::Value* rotor,
llvm::IRBuilder<>* builder);
diff --git a/tensorflow/compiler/xla/service/llvm_ir/vector_support_library.cc b/tensorflow/compiler/xla/service/llvm_ir/vector_support_library.cc
deleted file mode 100644
index e8c6a83618..0000000000
--- a/tensorflow/compiler/xla/service/llvm_ir/vector_support_library.cc
+++ /dev/null
@@ -1,150 +0,0 @@
-/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
- http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-
-#include "tensorflow/compiler/xla/service/llvm_ir/vector_support_library.h"
-
-#include "tensorflow/compiler/xla/service/llvm_ir/llvm_util.h"
-
-namespace xla {
-VectorSupportLibrary::VectorSupportLibrary(PrimitiveType primitive_type,
- int64 vector_size,
- llvm::IRBuilder<>* ir_builder,
- std::string name)
- : vector_size_(vector_size),
- primitive_type_(primitive_type),
- ir_builder_(ir_builder),
- name_(std::move(name)) {
- scalar_type_ = llvm_ir::PrimitiveTypeToIrType(
- primitive_type, ir_builder_->GetInsertBlock()->getModule());
- scalar_pointer_type_ = llvm::PointerType::getUnqual(scalar_type_);
- vector_type_ = llvm::VectorType::get(scalar_type_, vector_size);
- vector_pointer_type_ = llvm::PointerType::getUnqual(vector_type_);
-}
-
-llvm::Value* VectorSupportLibrary::Mul(llvm::Value* lhs, llvm::Value* rhs) {
- if (scalar_type_->isFloatingPointTy()) {
- return ir_builder()->CreateFMul(lhs, rhs, name());
- } else {
- return ir_builder()->CreateMul(lhs, rhs, name());
- }
-}
-
-llvm::Value* VectorSupportLibrary::Add(llvm::Value* lhs, llvm::Value* rhs) {
- if (scalar_type_->isFloatingPointTy()) {
- return ir_builder()->CreateFAdd(lhs, rhs, name());
- } else {
- return ir_builder()->CreateAdd(lhs, rhs, name());
- }
-}
-
-llvm::Value* VectorSupportLibrary::ComputeOffsetPointer(
- llvm::Value* base_pointer, llvm::Value* offset_elements) {
- if (base_pointer->getType() != scalar_pointer_type()) {
- base_pointer = ir_builder()->CreateBitCast(base_pointer,
- scalar_pointer_type(), name());
- }
- return ir_builder()->CreateInBoundsGEP(base_pointer, {offset_elements},
- name());
-}
-
-llvm::Value* VectorSupportLibrary::LoadVector(llvm::Value* pointer) {
- if (pointer->getType() != vector_pointer_type()) {
- pointer =
- ir_builder()->CreateBitCast(pointer, vector_pointer_type(), name());
- }
- return ir_builder()->CreateAlignedLoad(
- pointer, ShapeUtil::ByteSizeOfPrimitiveType(primitive_type_), name());
-}
-
-llvm::Value* VectorSupportLibrary::LoadScalar(llvm::Value* pointer) {
- if (pointer->getType() != scalar_pointer_type()) {
- pointer =
- ir_builder()->CreateBitCast(pointer, scalar_pointer_type(), name());
- }
- return ir_builder()->CreateAlignedLoad(
- pointer, ShapeUtil::ByteSizeOfPrimitiveType(primitive_type_), name());
-}
-
-void VectorSupportLibrary::StoreVector(llvm::Value* value,
- llvm::Value* pointer) {
- if (pointer->getType() != vector_pointer_type()) {
- pointer = ir_builder()->CreateBitCast(pointer, vector_pointer_type());
- }
- ir_builder()->CreateAlignedStore(
- value, pointer, ShapeUtil::ByteSizeOfPrimitiveType(primitive_type_));
-}
-
-void VectorSupportLibrary::StoreScalar(llvm::Value* value,
- llvm::Value* pointer) {
- if (pointer->getType() != scalar_pointer_type()) {
- pointer =
- ir_builder()->CreateBitCast(pointer, scalar_pointer_type(), name());
- }
- ir_builder()->CreateAlignedStore(
- value, pointer, ShapeUtil::ByteSizeOfPrimitiveType(primitive_type_));
-}
-
-llvm::Value* VectorSupportLibrary::LoadBroadcast(llvm::Value* pointer) {
- if (pointer->getType() != scalar_pointer_type()) {
- pointer =
- ir_builder()->CreateBitCast(pointer, scalar_pointer_type(), name());
- }
- return ir_builder()->CreateVectorSplat(
- vector_size(), ir_builder()->CreateLoad(pointer), name());
-}
-
-llvm::Value* VectorSupportLibrary::AddReduce(llvm::Value* vector) {
- llvm::SmallVector<llvm::Constant*, 32> mask(vector_size(), nullptr);
- for (unsigned i = vector_size(); i != 1; i >>= 1) {
-    // On every iteration, we shuffle the top half of the remaining lanes down
-    // into the bottom half, then add the shuffled vector to the original one.
-
- for (unsigned j = 0; j < vector_size(); ++j) {
- if (j < (i / 2)) {
- mask[j] = ir_builder()->getInt32(i / 2 + j);
- } else {
- mask[j] = llvm::UndefValue::get(ir_builder()->getInt32Ty());
- }
- }
-
- llvm::Value* half_remaining_lanes = ir_builder()->CreateShuffleVector(
- vector, llvm::UndefValue::get(vector_type()),
- llvm::ConstantVector::get(mask), "");
- vector = Add(vector, half_remaining_lanes);
- }
-
- return ir_builder()->CreateExtractElement(vector, ir_builder()->getInt32(0),
- name());
-}
-
-llvm::Value* VectorSupportLibrary::GetZeroVector() {
- return llvm::Constant::getNullValue(vector_type());
-}
-
-llvm::Value* VectorSupportLibrary::GetZeroScalar() {
- return llvm::Constant::getNullValue(scalar_type());
-}
-
-LlvmVariable::LlvmVariable(llvm::Type* type, llvm::IRBuilder<>* ir_builder)
- : ir_builder_(ir_builder) {
- alloca_ = llvm_ir::EmitAllocaAtFunctionEntry(type, "", ir_builder_);
-}
-
-llvm::Value* LlvmVariable::Get() { return ir_builder_->CreateLoad(alloca_); }
-
-void LlvmVariable::Set(llvm::Value* new_value) {
- ir_builder_->CreateStore(new_value, alloca_);
-}
-} // namespace xla
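The shuffle loop in `AddReduce` above is a log2(vector_size) tree reduction. A scalar C++ analogue of the same folding, for intuition only (not library code):

#include <cstddef>
#include <vector>

// Each step folds the top half of the active lanes into the bottom half,
// halving the active width until one lane holds the total. Assumes the
// lane count is a power of two, as AddReduce does.
float AddReduceScalarAnalogue(std::vector<float> lanes) {
  for (std::size_t active = lanes.size(); active != 1; active >>= 1) {
    for (std::size_t j = 0; j < active / 2; ++j) {
      lanes[j] += lanes[active / 2 + j];  // "Shuffle" lane active/2+j down.
    }
  }
  return lanes[0];
}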
diff --git a/tensorflow/compiler/xla/service/llvm_ir/vector_support_library.h b/tensorflow/compiler/xla/service/llvm_ir/vector_support_library.h
deleted file mode 100644
index 3072677ab0..0000000000
--- a/tensorflow/compiler/xla/service/llvm_ir/vector_support_library.h
+++ /dev/null
@@ -1,174 +0,0 @@
-/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
- http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-
-#ifndef THIRD_PARTY_TENSORFLOW_COMPILER_XLA_SERVICE_LLVM_IR_VECTOR_SUPPORT_LIBRARY_H_
-#define THIRD_PARTY_TENSORFLOW_COMPILER_XLA_SERVICE_LLVM_IR_VECTOR_SUPPORT_LIBRARY_H_
-
-#include <string>
-
-#include "llvm/IR/IRBuilder.h"
-#include "llvm/IR/Value.h"
-#include "tensorflow/compiler/xla/types.h"
-#include "tensorflow/compiler/xla/xla_data.pb.h"
-
-namespace xla {
-// A thin wrapper around llvm_util.h to make code that generates vector math
-// more readable.
-class VectorSupportLibrary {
- public:
- // This VectorSupportLibrary instance remembers `primitive_type` and
- // `vector_size`, and these are implicitly used by the methods on this
- // instance (e.g. LoadVector will load a vector of type <`vector_size` x
- // `primitive_type`>).
- VectorSupportLibrary(PrimitiveType primitive_type, int64 vector_size,
- llvm::IRBuilder<>* ir_builder, std::string name);
-
- llvm::Value* Mul(llvm::Value* lhs, llvm::Value* rhs);
- llvm::Value* Mul(int64 lhs, llvm::Value* rhs) {
- return Mul(ir_builder()->getInt64(lhs), rhs);
- }
-
- llvm::Value* Add(llvm::Value* lhs, llvm::Value* rhs);
- llvm::Value* Add(int64 lhs, llvm::Value* rhs) {
- return Add(ir_builder()->getInt64(lhs), rhs);
- }
-
- llvm::Value* MulAdd(llvm::Value* a, llvm::Value* b, llvm::Value* c) {
- return Add(c, Mul(a, b));
- }
-
- llvm::Value* ComputeOffsetPointer(llvm::Value* base_pointer,
- llvm::Value* offset_elements);
- llvm::Value* ComputeOffsetPointer(llvm::Value* base_pointer,
- int64 offset_elements) {
- return ComputeOffsetPointer(base_pointer,
- ir_builder()->getInt64(offset_elements));
- }
-
- llvm::Value* LoadVector(llvm::Value* pointer);
-
- llvm::Value* LoadVector(llvm::Value* base_pointer,
- llvm::Value* offset_elements) {
- return LoadVector(ComputeOffsetPointer(base_pointer, offset_elements));
- }
-
- llvm::Value* LoadVector(llvm::Value* base_pointer, int64 offset_elements) {
- return LoadVector(base_pointer, ir_builder()->getInt64(offset_elements));
- }
-
- llvm::Value* LoadScalar(llvm::Value* pointer);
-
- llvm::Value* LoadScalar(llvm::Value* base_pointer,
- llvm::Value* offset_elements) {
- return LoadScalar(ComputeOffsetPointer(base_pointer, offset_elements));
- }
-
- llvm::Value* LoadScalar(llvm::Value* base_pointer, int64 offset_elements) {
- return LoadScalar(base_pointer, ir_builder()->getInt64(offset_elements));
- }
-
- void StoreVector(llvm::Value* value, llvm::Value* pointer);
-
- void StoreVector(llvm::Value* value, llvm::Value* base_pointer,
- llvm::Value* offset_elements) {
- StoreVector(value, ComputeOffsetPointer(base_pointer, offset_elements));
- }
-
- void StoreVector(llvm::Value* value, llvm::Value* base_pointer,
- int64 offset_elements) {
- StoreVector(value, base_pointer, ir_builder()->getInt64(offset_elements));
- }
-
- void StoreScalar(llvm::Value* value, llvm::Value* pointer);
- void StoreScalar(llvm::Value* value, llvm::Value* base_pointer,
- llvm::Value* offset_elements) {
- StoreScalar(value, ComputeOffsetPointer(base_pointer, offset_elements));
- }
-
- void StoreScalar(llvm::Value* value, llvm::Value* base_pointer,
- int64 offset_elements) {
-    StoreScalar(value, base_pointer, ir_builder()->getInt64(offset_elements));
- }
-
- llvm::Value* LoadBroadcast(llvm::Value* pointer);
- llvm::Value* LoadBroadcast(llvm::Value* base_pointer,
- llvm::Value* offset_elements) {
- return LoadBroadcast(ComputeOffsetPointer(base_pointer, offset_elements));
- }
- llvm::Value* LoadBroadcast(llvm::Value* base_pointer, int64 offset_elements) {
- return LoadBroadcast(base_pointer, ir_builder()->getInt64(offset_elements));
- }
-
- llvm::Value* AddReduce(llvm::Value* vector);
-
- llvm::Value* GetZeroVector();
- llvm::Value* GetZeroScalar();
-
- llvm::IRBuilder<>* ir_builder() const { return ir_builder_; }
- int64 vector_size() const { return vector_size_; }
- llvm::Type* vector_type() const { return vector_type_; }
- llvm::Type* vector_pointer_type() const { return vector_pointer_type_; }
- llvm::Type* scalar_type() const { return scalar_type_; }
- llvm::Type* scalar_pointer_type() const { return scalar_pointer_type_; }
-
- const std::string& name() const { return name_; }
-
- private:
- int64 vector_size_;
- PrimitiveType primitive_type_;
- llvm::IRBuilder<>* ir_builder_;
- llvm::Type* vector_type_;
- llvm::Type* vector_pointer_type_;
- llvm::Type* scalar_type_;
- llvm::Type* scalar_pointer_type_;
- std::string name_;
-};
-
-// This wraps an alloca-backed stack variable which LLVM's SSA construction pass
-// can later convert to an SSA value.
-class LlvmVariable {
- public:
- LlvmVariable(llvm::Type*, llvm::IRBuilder<>* ir_builder);
-
- llvm::Value* Get();
- void Set(llvm::Value* new_value);
-
- private:
- llvm::AllocaInst* alloca_;
- llvm::IRBuilder<>* ir_builder_;
-};
-
-class VectorVariable : public LlvmVariable {
- public:
- VectorVariable(VectorSupportLibrary* vector_support,
- llvm::Value* initial_value)
- : LlvmVariable(vector_support->vector_type(),
- vector_support->ir_builder()) {
- Set(initial_value);
- }
-};
-
-class ScalarVariable : public LlvmVariable {
- public:
- ScalarVariable(VectorSupportLibrary* vector_support,
- llvm::Value* initial_value)
- : LlvmVariable(vector_support->scalar_type(),
- vector_support->ir_builder()) {
- Set(initial_value);
- }
-};
-} // namespace xla
-
-#endif // THIRD_PARTY_TENSORFLOW_COMPILER_XLA_SERVICE_LLVM_IR_VECTOR_SUPPORT_LIBRARY_H_
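Putting the interface above together, a hypothetical dot-product inner kernel; `EmitDotBody` is an illustrative name, and `lhs`/`rhs` are assumed to point at `n` floats with `n` a multiple of the vector width:

#include "llvm/IR/IRBuilder.h"
#include "tensorflow/compiler/xla/service/llvm_ir/vector_support_library.h"

void EmitDotBody(llvm::IRBuilder<>* b, llvm::Value* lhs, llvm::Value* rhs,
                 int64_t n) {
  xla::VectorSupportLibrary vsl(xla::F32, /*vector_size=*/8, b, "dot");
  xla::VectorVariable acc(&vsl, vsl.GetZeroVector());
  // Host-side loop: emits one vector multiply-accumulate per 8 elements.
  for (int64_t i = 0; i < n; i += vsl.vector_size()) {
    llvm::Value* prod =
        vsl.Mul(vsl.LoadVector(lhs, i), vsl.LoadVector(rhs, i));
    acc.Set(vsl.Add(acc.Get(), prod));
  }
  llvm::Value* sum = vsl.AddReduce(acc.Get());  // Horizontal add to a scalar.
  (void)sum;  // A real emitter would store or return this value.
}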
diff --git a/tensorflow/compiler/xla/service/local_service.cc b/tensorflow/compiler/xla/service/local_service.cc
index 06f43bd3cb..d4d35da9d6 100644
--- a/tensorflow/compiler/xla/service/local_service.cc
+++ b/tensorflow/compiler/xla/service/local_service.cc
@@ -68,6 +68,26 @@ LocalService::LocalService(const ServiceOptions& options,
std::unique_ptr<Backend> execute_backend)
: Service(options, std::move(execute_backend)) {}
+namespace {
+// Returns the space required to allocate a shape. If
+// `allocate_space_for_deep_copy` is true, the space includes all sub-buffers
+// of a tuple.
+int64 RequiredSpace(const Shape& shape, bool allocate_space_for_deep_copy,
+ TransferManager* transfer_manager) {
+ int64 size = 0;
+ // TODO(b/33492279) remove once no devices represent result tuples as
+ // contiguous buffers.
+ if (allocate_space_for_deep_copy) {
+ ShapeUtil::ForEachSubshape(
+ shape, [&size, transfer_manager](const Shape& subshape,
+ const ShapeIndex& /*index*/) {
+ size += transfer_manager->GetByteSizeRequirement(subshape);
+ });
+ }
+ return size;
+}
+} // namespace
+
StatusOr<std::unique_ptr<Executable>> LocalService::CompileExecutable(
const ComputationHandle& computation,
const tensorflow::gtl::ArraySlice<const Shape*> argument_layouts,
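A worked illustration of the traversal in `RequiredSpace`; the per-subshape sizes below are hypothetical, since `GetByteSizeRequirement` is transfer-manager specific:

// For shape T = (f32[2,3], f32[4]), ForEachSubshape visits three subshapes:
//   {}  -> the tuple itself (its pointer table; size is device-specific)
//   {0} -> f32[2,3], typically 2 * 3 * 4 = 24 bytes
//   {1} -> f32[4],   typically 4 * 4     = 16 bytes
// With allocate_space_for_deep_copy set, RequiredSpace sums all three;
// otherwise the loop is skipped and it returns 0.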
diff --git a/tensorflow/compiler/xla/service/logical_buffer_analysis.cc b/tensorflow/compiler/xla/service/logical_buffer_analysis.cc
index 02dc49e78c..b92017c6cb 100644
--- a/tensorflow/compiler/xla/service/logical_buffer_analysis.cc
+++ b/tensorflow/compiler/xla/service/logical_buffer_analysis.cc
@@ -104,21 +104,6 @@ Status LogicalBufferAnalysis::HandleBitcast(HloInstruction*) {
return Status::OK();
}
-Status LogicalBufferAnalysis::HandleRecvDone(HloInstruction*) {
- // RecvDone doesn't create a new buffer but rather aliases its input (Recv)
- // tuple element at {0} to its output.
- return Status::OK();
-}
-
-Status LogicalBufferAnalysis::HandleSend(HloInstruction* send) {
- // Send creates new buffers for the top-level tuple and the context (tuple
- // element at {1}). Tuple element at {0} is an alias of the Send operand, so
- // we don't need to create a new Logical Buffer for that.
- NewLogicalBuffer(send, /*index=*/{});
- NewLogicalBuffer(send, /*index=*/{1});
- return Status::OK();
-}
-
Status LogicalBufferAnalysis::HandleTuple(HloInstruction* tuple) {
// A Tuple instruction only creates the top-level buffer.
NewLogicalBuffer(tuple, /*index=*/{});
diff --git a/tensorflow/compiler/xla/service/logical_buffer_analysis.h b/tensorflow/compiler/xla/service/logical_buffer_analysis.h
index 598d08b720..a82e83ec5c 100644
--- a/tensorflow/compiler/xla/service/logical_buffer_analysis.h
+++ b/tensorflow/compiler/xla/service/logical_buffer_analysis.h
@@ -60,8 +60,6 @@ class LogicalBufferAnalysis : public DfsHloVisitorWithDefault {
Status HandleGetTupleElement(HloInstruction* get_tuple_element) override;
Status HandleBitcast(HloInstruction* bitcast) override;
Status HandleCopy(HloInstruction* copy) override;
- Status HandleRecvDone(HloInstruction* recv_done) override;
- Status HandleSend(HloInstruction* send) override;
Status HandleSelect(HloInstruction* select) override;
// A map from the buffer ID to the logical buffer
diff --git a/tensorflow/compiler/xla/service/service.h b/tensorflow/compiler/xla/service/service.h
index 47f4f0ade5..6646be2e9a 100644
--- a/tensorflow/compiler/xla/service/service.h
+++ b/tensorflow/compiler/xla/service/service.h
@@ -272,6 +272,8 @@ class Service : public ServiceInterface {
  // Create an HLO module config for the given program shape and arguments.
// execution_options is optional; if not given a default is used.
+ // has_hybrid_result is used to initialize the same-named field in
+ // HloModuleConfig -- see that class for documentation.
StatusOr<std::unique_ptr<HloModuleConfig>> CreateModuleConfig(
const ProgramShape& program_shape,
tensorflow::gtl::ArraySlice<const Shape*> argument_shapes,
diff --git a/tensorflow/compiler/xla/service/shape_inference.cc b/tensorflow/compiler/xla/service/shape_inference.cc
index dcd726f22c..791d17365b 100644
--- a/tensorflow/compiler/xla/service/shape_inference.cc
+++ b/tensorflow/compiler/xla/service/shape_inference.cc
@@ -31,7 +31,6 @@ limitations under the License.
#include "tensorflow/core/lib/core/stringpiece.h"
#include "tensorflow/core/lib/math/math_util.h"
#include "tensorflow/core/lib/strings/str_util.h"
-#include "tensorflow/core/lib/strings/strcat.h"
#include "tensorflow/core/lib/strings/stringprintf.h"
#include "tensorflow/core/platform/logging.h"
#include "tensorflow/core/platform/protobuf.h"
@@ -771,12 +770,8 @@ ShapeInference::InferDegenerateDimensionBroadcastShape(
TF_DCHECK_OK(ShapeUtil::ValidateShapeWithOptionalLayout(lhs));
TF_DCHECK_OK(ShapeUtil::ValidateShapeWithOptionalLayout(rhs));
- TF_RETURN_IF_ERROR(ExpectNotTupleOrOpaque(
- lhs, tensorflow::strings::StrCat("lhs of binary operation ",
- BinaryOperation_Name(operation))));
- TF_RETURN_IF_ERROR(ExpectNotTupleOrOpaque(
- rhs, tensorflow::strings::StrCat("rhs of binary operation ",
- BinaryOperation_Name(operation))));
+ TF_RETURN_IF_ERROR(ExpectNotTupleOrOpaque(lhs, "lhs of binary operation"));
+ TF_RETURN_IF_ERROR(ExpectNotTupleOrOpaque(rhs, "rhs of binary operation"));
switch (operation) {
case BINOP_DOT:
return InferDotOpShape(lhs, rhs);
@@ -1948,10 +1943,7 @@ ShapeInference::InferDegenerateDimensionBroadcastShape(
!std::is_permutation(dimensions.begin(), dimensions.end(),
indices.begin())) {
return InvalidArgument(
- "Reshape dimensions [%s] are not a permutation of the operand "
- "dimensions (operand shape is %s).",
- tensorflow::str_util::Join(dimensions, ",").c_str(),
- ShapeUtil::HumanString(operand).c_str());
+ "Reshape dimensions not a permutation of the operand dimensions.");
}
return inferred_shape;
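The validity test in the hunk above is simply std::is_permutation against the identity index list; a self-contained sketch with hypothetical values:

#include <algorithm>
#include <numeric>
#include <vector>

// `dimensions` must name every operand dimension exactly once, in any order.
bool IsReshapeDimensionOrderValid(const std::vector<int>& dimensions,
                                  int rank) {
  std::vector<int> indices(rank);
  std::iota(indices.begin(), indices.end(), 0);  // 0, 1, ..., rank - 1
  return static_cast<int>(dimensions.size()) == rank &&
         std::is_permutation(dimensions.begin(), dimensions.end(),
                             indices.begin());
}
// IsReshapeDimensionOrderValid({2, 0, 1}, 3) -> true
// IsReshapeDimensionOrderValid({0, 0, 2}, 3) -> false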
diff --git a/tensorflow/compiler/xla/service/tuple_points_to_analysis.cc b/tensorflow/compiler/xla/service/tuple_points_to_analysis.cc
index a1f9451dd4..df537bd7c1 100644
--- a/tensorflow/compiler/xla/service/tuple_points_to_analysis.cc
+++ b/tensorflow/compiler/xla/service/tuple_points_to_analysis.cc
@@ -253,64 +253,6 @@ Status TuplePointsToAnalysis::HandleBitcast(HloInstruction* bitcast) {
return Status::OK();
}
-Status TuplePointsToAnalysis::HandleRecvDone(HloInstruction* recv_done) {
- // RecvDone aliases its input (Recv) tuple element {0} to its output.
- PointsToSet& points_to_set = CreateEmptyPointsToSet(recv_done);
- const PointsToSet& operand_points_to_set =
- GetPointsToSet(recv_done->operand(0));
-
-  // Recursively copy the points to set of the operand's tuple element {0}.
- points_to_set.ForEachMutableElement(
- [this, &points_to_set, &operand_points_to_set](
- const ShapeIndex& index, PointsToSet::BufferList* buffers) {
- ShapeIndex src_index({0});
- for (auto element : index) {
- src_index.push_back(element);
- }
- *buffers = operand_points_to_set.element(src_index);
- for (auto& tuple_source :
- operand_points_to_set.tuple_sources(src_index)) {
- points_to_set.add_tuple_source(index, tuple_source);
- }
- });
- return Status::OK();
-}
-
-Status TuplePointsToAnalysis::HandleSend(HloInstruction* send) {
- // Send creates a tuple of {aliased operand, U32 context}.
- PointsToSet& points_to_set = CreateEmptyPointsToSet(send);
-
- // Creates the points to set for the tuple and its element at {1}.
- auto top_buffer = points_to_set.mutable_element(ShapeIndex({}));
- top_buffer->push_back(
- &logical_buffer_analysis_->GetBuffer(send, ShapeIndex({})));
- points_to_set.add_tuple_source({}, send);
-
- auto context_buffer = points_to_set.mutable_element(ShapeIndex({1}));
- context_buffer->push_back(
- &logical_buffer_analysis_->GetBuffer(send, ShapeIndex({1})));
-
-  // Recursively copy the points to set of the operand to the output tuple
-  // element at {0}.
- const PointsToSet& operand_points_to_set = GetPointsToSet(send->operand(0));
- operand_points_to_set.ForEachElement(
- [&points_to_set, &operand_points_to_set](
- const ShapeIndex& src_index,
- const PointsToSet::BufferList& points_to) {
- ShapeIndex target_index({0});
- for (auto element : src_index) {
- target_index.push_back(element);
- }
- *points_to_set.mutable_element(target_index) = points_to;
-
- for (HloInstruction* tuple :
- operand_points_to_set.tuple_sources(src_index)) {
- points_to_set.add_tuple_source(target_index, tuple);
- }
- });
-
- return Status::OK();
-}
-
Status TuplePointsToAnalysis::HandleTuple(HloInstruction* tuple) {
tensorflow::gtl::ArraySlice<HloInstruction*> operands(tuple->operands());
PointsToSet& points_to_set = CreateEmptyPointsToSet(tuple);
diff --git a/tensorflow/compiler/xla/service/tuple_points_to_analysis.h b/tensorflow/compiler/xla/service/tuple_points_to_analysis.h
index 8928de107e..e6157a1ed1 100644
--- a/tensorflow/compiler/xla/service/tuple_points_to_analysis.h
+++ b/tensorflow/compiler/xla/service/tuple_points_to_analysis.h
@@ -251,8 +251,6 @@ class TuplePointsToAnalysis : public DfsHloVisitorWithDefault {
Status HandleGetTupleElement(HloInstruction* get_tuple_element) override;
Status HandleBitcast(HloInstruction* bitcast) override;
Status HandleCopy(HloInstruction* copy) override;
- Status HandleRecvDone(HloInstruction* recv_done) override;
- Status HandleSend(HloInstruction* send) override;
Status HandleSelect(HloInstruction* select) override;
string ToString() const;
diff --git a/tensorflow/compiler/xla/service/tuple_points_to_analysis_test.cc b/tensorflow/compiler/xla/service/tuple_points_to_analysis_test.cc
index dec446d4da..694ed57fa2 100644
--- a/tensorflow/compiler/xla/service/tuple_points_to_analysis_test.cc
+++ b/tensorflow/compiler/xla/service/tuple_points_to_analysis_test.cc
@@ -313,51 +313,6 @@ TEST_F(TuplePointsToAnalysisTest, TupleCopy) {
{constant1, constant2, copy});
}
-TEST_F(TuplePointsToAnalysisTest, SendAndSendDone) {
- // Send forwards its operand to the output tuple at {0}.
- auto builder = HloComputation::Builder(TestName());
- auto constant = builder.AddInstruction(
- HloInstruction::CreateConstant(Literal::CreateR0<float>(1.0)));
- auto send = builder.AddInstruction(
- HloInstruction::CreateSend(constant, /*channel_id=*/0));
- auto send_done = builder.AddInstruction(HloInstruction::CreateSendDone(send));
-
- BuildModuleAndRunAnalysis(builder.Build());
-
- EXPECT_FALSE(points_to_analysis_->GetPointsToSet(send).IsAmbiguous());
- EXPECT_TRUE(points_to_analysis_->GetPointsToSet(send).IsDistinct());
- EXPECT_FALSE(points_to_analysis_->GetPointsToSet(send_done).IsAmbiguous());
- EXPECT_TRUE(points_to_analysis_->GetPointsToSet(send_done).IsDistinct());
-
- ExpectHasTopLevelBuffers(
- points_to_analysis_->GetPointsToSet(send).element({}), {send});
- ExpectHasTopLevelBuffers(
- points_to_analysis_->GetPointsToSet(send).element({0}), {constant});
- ExpectHasTopLevelBuffers(
- points_to_analysis_->GetPointsToSet(send_done).CreateFlattenedSet(),
- {send_done});
- ExpectHasBufferAliases(constant, {}, {{constant, {}}, {send, {0}}});
-}
-
-TEST_F(TuplePointsToAnalysisTest, RecvAndRecvDone) {
- // RecvDone forwards its operand tuple element at {0} to the output.
- auto builder = HloComputation::Builder(TestName());
- auto recv = builder.AddInstruction(HloInstruction::CreateRecv(
- ShapeUtil::MakeShape(F32, {1, 2, 3}), /*channel_id=*/0));
- auto recv_done = builder.AddInstruction(HloInstruction::CreateRecvDone(recv));
-
- BuildModuleAndRunAnalysis(builder.Build());
-
- EXPECT_FALSE(points_to_analysis_->GetPointsToSet(recv).IsAmbiguous());
- EXPECT_TRUE(points_to_analysis_->GetPointsToSet(recv).IsDistinct());
- EXPECT_FALSE(points_to_analysis_->GetPointsToSet(recv_done).IsAmbiguous());
- EXPECT_TRUE(points_to_analysis_->GetPointsToSet(recv_done).IsDistinct());
-
- ExpectHasTopLevelBuffers(
- points_to_analysis_->GetPointsToSet(recv).element({}), {recv});
- ExpectHasBufferAliases(recv, {0}, {{recv, {0}}, {recv_done, {}}});
-}
-
TEST_F(TuplePointsToAnalysisTest, TupleSelect) {
// Select from two different tuples. This should create an ambiguous points to
// set containing the union of both sides.
diff --git a/tensorflow/compiler/xla/service/user_computation.cc b/tensorflow/compiler/xla/service/user_computation.cc
index 8d5bb08e51..e9d182509b 100644
--- a/tensorflow/compiler/xla/service/user_computation.cc
+++ b/tensorflow/compiler/xla/service/user_computation.cc
@@ -2927,9 +2927,8 @@ void ComputationLowerer::Visit(
case OpRequest::kRecvRequest: {
const RecvRequest& recv_request = request.request().recv_request();
- HloInstruction* recv = add_instruction(HloInstruction::CreateRecv(
+ hlo_instruction = add_instruction(HloInstruction::CreateRecv(
request.output_shape(), recv_request.channel_handle().handle()));
- hlo_instruction = add_instruction(HloInstruction::CreateRecvDone(recv));
break;
}
@@ -3121,9 +3120,8 @@ void ComputationLowerer::Visit(
case OpRequest::kSendRequest: {
const SendRequest& send_request = request.request().send_request();
HloInstruction* operand = lookup_instruction(send_request.operand());
- HloInstruction* send = add_instruction(HloInstruction::CreateSend(
+ hlo_instruction = add_instruction(HloInstruction::CreateSend(
operand, send_request.channel_handle().handle()));
- hlo_instruction = add_instruction(HloInstruction::CreateSendDone(send));
break;
}
diff --git a/tensorflow/compiler/xla/service/while_loop_simplifier.cc b/tensorflow/compiler/xla/service/while_loop_simplifier.cc
index 2fac914892..65734f91bc 100644
--- a/tensorflow/compiler/xla/service/while_loop_simplifier.cc
+++ b/tensorflow/compiler/xla/service/while_loop_simplifier.cc
@@ -58,9 +58,7 @@ static bool ContainsSendOrRecv(const HloComputation* comp) {
static bool IsOrContainsSendOrRecv(const HloInstruction* instr) {
if (instr->opcode() == HloOpcode::kSend ||
- instr->opcode() == HloOpcode::kSendDone ||
- instr->opcode() == HloOpcode::kRecv ||
- instr->opcode() == HloOpcode::kRecvDone) {
+ instr->opcode() == HloOpcode::kRecv) {
return true;
}
for (const auto& subcomp : instr->called_computations()) {
diff --git a/tensorflow/compiler/xla/service/while_loop_simplifier_test.cc b/tensorflow/compiler/xla/service/while_loop_simplifier_test.cc
index d99b31dc00..8e1a2dcde1 100644
--- a/tensorflow/compiler/xla/service/while_loop_simplifier_test.cc
+++ b/tensorflow/compiler/xla/service/while_loop_simplifier_test.cc
@@ -144,11 +144,10 @@ TEST_F(WhileLoopSimplifierTest, NotRemovedIfContainsSend) {
auto* while_op = computation->root_instruction();
ASSERT_EQ(while_op->opcode(), HloOpcode::kWhile);
auto* while_body = while_op->while_body();
- auto* send = while_body->AddInstruction(HloInstruction::CreateSend(
+ while_body->AddInstruction(HloInstruction::CreateSend(
while_body->AddInstruction(
HloInstruction::CreateConstant(Literal::CreateR0<bool>(true))),
/*channel_id=*/0));
- while_body->AddInstruction(HloInstruction::CreateSendDone(send));
EXPECT_FALSE(WhileLoopSimplifier().Run(&module()).ValueOrDie());
}
@@ -157,10 +156,9 @@ TEST_F(WhileLoopSimplifierTest, NotRemovedIfContainsRecv) {
auto* while_op = computation->root_instruction();
ASSERT_EQ(while_op->opcode(), HloOpcode::kWhile);
auto* while_body = while_op->while_body();
- auto* recv = while_body->AddInstruction(
+ while_body->AddInstruction(
HloInstruction::CreateRecv(ShapeUtil::MakeShape(F32, {1}),
/*channel_id=*/0));
- while_body->AddInstruction(HloInstruction::CreateRecvDone(recv));
EXPECT_FALSE(WhileLoopSimplifier().Run(&module()).ValueOrDie());
}
diff --git a/tensorflow/compiler/xla/shape_tree.h b/tensorflow/compiler/xla/shape_tree.h
index a898a4d375..64a36471b9 100644
--- a/tensorflow/compiler/xla/shape_tree.h
+++ b/tensorflow/compiler/xla/shape_tree.h
@@ -116,7 +116,6 @@ class ShapeTree {
ShapeTree(const Shape* shape, const T& init_value);
ShapeTree(const ShapeTree& other) { *this = other; }
- ShapeTree(ShapeTree&&) = default;
ShapeTree& operator=(const ShapeTree& other) {
root_ = other.root_;
@@ -133,8 +132,6 @@ class ShapeTree {
return *this;
}
- ShapeTree& operator=(ShapeTree&& other) = default;
-
// Returns the data element associated with the array in the shape at the
// given index (see ShapeUtil::GetSubshape for how indexes are defined).
const T& element(const ShapeIndex& index) const;
diff --git a/tensorflow/compiler/xla/shape_util.cc b/tensorflow/compiler/xla/shape_util.cc
index 4d0bafa908..b5eb81dfc6 100644
--- a/tensorflow/compiler/xla/shape_util.cc
+++ b/tensorflow/compiler/xla/shape_util.cc
@@ -263,7 +263,6 @@ StatusOr<Shape> MakeShapeWithLayoutInternal(
case S32:
case S64:
case F16:
- case BF16:
case F32:
case F64:
return true;
diff --git a/tensorflow/compiler/xla/tests/BUILD b/tensorflow/compiler/xla/tests/BUILD
index 3e62481629..4e1be24b61 100644
--- a/tensorflow/compiler/xla/tests/BUILD
+++ b/tensorflow/compiler/xla/tests/BUILD
@@ -61,14 +61,13 @@ generate_backend_test_macros()
cc_library(
name = "test_utils",
- srcs = ["test_utils.cc"],
+ testonly = True,
hdrs = ["test_utils.h"],
deps = [
"//tensorflow/compiler/xla:literal_util",
"//tensorflow/compiler/xla:shape_util",
"//tensorflow/compiler/xla:util",
"//tensorflow/compiler/xla:xla_data_proto",
- "//tensorflow/compiler/xla/service:hlo",
"//tensorflow/core:lib",
],
)
@@ -1344,23 +1343,22 @@ xla_test(
],
)
-tf_cc_test(
+xla_test(
name = "llvm_compiler_test",
srcs = ["llvm_compiler_test.cc"],
- tags = ["requires-gpu-sm35"],
+ backends = [
+ "cpu",
+ "gpu",
+ "cpu_parallel",
+ ],
deps = [
- "//tensorflow/compiler/xla:test_helpers",
- "//tensorflow/compiler/xla/service:backend",
- "//tensorflow/compiler/xla/service:cpu_plugin",
- "//tensorflow/compiler/xla/service:gpu_plugin",
+ "//tensorflow/compiler/xla:literal_util",
"//tensorflow/compiler/xla/service:hlo",
"//tensorflow/compiler/xla/service:llvm_compiler",
- "//tensorflow/compiler/xla/service:platform_util",
- "//tensorflow/compiler/xla/service/cpu:cpu_compiler",
- "//tensorflow/compiler/xla/service/gpu:gpu_compiler",
+ "//tensorflow/compiler/xla/tests:hlo_test_base",
+ "//tensorflow/compiler/xla/tests:literal_test_util",
+ "//tensorflow/compiler/xla/tests:xla_internal_test_main",
"//tensorflow/core:test",
- "//tensorflow/core:test_main",
- "//tensorflow/stream_executor",
"@llvm//:core",
],
)
diff --git a/tensorflow/compiler/xla/tests/client_library_test_base.cc b/tensorflow/compiler/xla/tests/client_library_test_base.cc
index ef54714e46..065bce7e31 100644
--- a/tensorflow/compiler/xla/tests/client_library_test_base.cc
+++ b/tensorflow/compiler/xla/tests/client_library_test_base.cc
@@ -346,60 +346,6 @@ void ClientLibraryTestBase::ComputeAndCompareTuple(
LiteralTestUtil::ExpectNearTuple(expected, *actual, error);
}
-void ClientLibraryTestBase::ComputeAndCompare(
- ComputationBuilder* builder, const ComputationDataHandle& operand,
- tensorflow::gtl::ArraySlice<Literal> arguments) {
- auto status_or_data = ComputeValueAndReference(builder, operand, arguments);
- EXPECT_IS_OK(status_or_data);
- if (!status_or_data.ok()) {
- return;
- }
- std::unique_ptr<Literal> reference, result;
- std::tie(reference, result) = status_or_data.ConsumeValueOrDie();
- LiteralTestUtil::ExpectEqual(*reference, *result);
-}
-
-void ClientLibraryTestBase::ComputeAndCompare(
- ComputationBuilder* builder, const ComputationDataHandle& operand,
- tensorflow::gtl::ArraySlice<Literal> arguments, ErrorSpec error) {
- auto status_or_data = ComputeValueAndReference(builder, operand, arguments);
- EXPECT_IS_OK(status_or_data);
- if (!status_or_data.ok()) {
- return;
- }
- std::unique_ptr<Literal> reference, result;
- std::tie(reference, result) = status_or_data.ConsumeValueOrDie();
- LiteralTestUtil::ExpectNear(*reference, *result, error);
-}
-
-StatusOr<std::pair<std::unique_ptr<Literal>, std::unique_ptr<Literal>>>
-ClientLibraryTestBase::ComputeValueAndReference(
- ComputationBuilder* builder, const ComputationDataHandle& operand,
- tensorflow::gtl::ArraySlice<Literal> arguments) {
-  // Transfer the arguments to the executor service. We put the unique_ptrs
- // into a vector to keep the data alive on the service until the end of this
- // function.
- std::vector<std::unique_ptr<GlobalData>> argument_data;
- for (const auto& arg : arguments) {
- TF_ASSIGN_OR_RETURN(auto data, client_->TransferToServer(arg));
- argument_data.push_back(std::move(data));
- }
-
- // Create raw pointers to the GlobalData for the rest of the call stack.
- std::vector<GlobalData*> argument_data_ptr;
- std::transform(
- argument_data.begin(), argument_data.end(),
- std::back_inserter(argument_data_ptr),
- [](const std::unique_ptr<GlobalData>& data) { return data.get(); });
-
- TF_ASSIGN_OR_RETURN(
- auto reference,
- builder->ComputeConstant(operand, /*output_layout=*/nullptr, arguments));
- TF_ASSIGN_OR_RETURN(auto result,
- ExecuteAndTransfer(builder, argument_data_ptr));
- return std::make_pair(std::move(reference), std::move(result));
-}
-
Computation ClientLibraryTestBase::CreateScalarRelu() {
ComputationBuilder builder(client_, "relu");
auto z_value = builder.Parameter(0, ShapeUtil::MakeShape(F32, {}), "z_value");
diff --git a/tensorflow/compiler/xla/tests/client_library_test_base.h b/tensorflow/compiler/xla/tests/client_library_test_base.h
index b578667735..7cfc276ec1 100644
--- a/tensorflow/compiler/xla/tests/client_library_test_base.h
+++ b/tensorflow/compiler/xla/tests/client_library_test_base.h
@@ -196,16 +196,6 @@ class ClientLibraryTestBase : public ::testing::Test {
ComputationBuilder* builder, const Literal& expected,
tensorflow::gtl::ArraySlice<GlobalData*> arguments, ErrorSpec abs_error);
-  // Convenience method for running a built computation and comparing its
-  // result against the reference value computed by the HloEvaluator.
- void ComputeAndCompare(ComputationBuilder* builder,
- const ComputationDataHandle& operand,
- tensorflow::gtl::ArraySlice<Literal> arguments);
- void ComputeAndCompare(ComputationBuilder* builder,
- const ComputationDataHandle& operand,
- tensorflow::gtl::ArraySlice<Literal> arguments,
- ErrorSpec error);
-
// Create scalar operations for use in reductions.
Computation CreateScalarRelu();
Computation CreateScalarMax();
@@ -308,13 +298,6 @@ class ClientLibraryTestBase : public ::testing::Test {
const std::function<void(const Literal& actual,
const string& error_message)>& verify_output,
const Shape* output_with_layout = nullptr);
-
- // Executes the computation and calculates the expected reference value using
-  // the HloEvaluator. Returns two literals in the order (expected, actual).
- StatusOr<std::pair<std::unique_ptr<Literal>, std::unique_ptr<Literal>>>
- ComputeValueAndReference(ComputationBuilder* builder,
- const ComputationDataHandle& operand,
- tensorflow::gtl::ArraySlice<Literal> arguments);
};
template <typename NativeT>
@@ -486,7 +469,8 @@ template <typename NativeT>
std::vector<NativeT> ClientLibraryTestBase::CreatePseudorandomR1(
const int width, NativeT min_value, NativeT max_value, uint32 seed) {
std::vector<NativeT> result(width);
- PseudorandomGenerator<NativeT> generator(min_value, max_value, seed);
+ test_utils::PseudorandomGenerator<NativeT> generator(min_value, max_value,
+ seed);
for (int i = 0; i < width; ++i) {
result[i] = generator.get();
}
@@ -498,7 +482,8 @@ std::unique_ptr<Array2D<NativeT>> ClientLibraryTestBase::CreatePseudorandomR2(
const int rows, const int cols, NativeT min_value, NativeT max_value,
uint32 seed) {
auto result = MakeUnique<Array2D<NativeT>>(rows, cols);
- PseudorandomGenerator<NativeT> generator(min_value, max_value, seed);
+ test_utils::PseudorandomGenerator<NativeT> generator(min_value, max_value,
+ seed);
for (int y = 0; y < rows; ++y) {
for (int x = 0; x < cols; ++x) {
(*result)(y, x) = generator.get();
diff --git a/tensorflow/compiler/xla/tests/client_test.cc b/tensorflow/compiler/xla/tests/client_test.cc
index 183bcf1dd3..0853feeebd 100644
--- a/tensorflow/compiler/xla/tests/client_test.cc
+++ b/tensorflow/compiler/xla/tests/client_test.cc
@@ -54,8 +54,8 @@ TEST_F(ClientTest, ExecuteWithLayout) {
.ConsumeValueOrDie();
std::unique_ptr<Literal> expected_literal =
- Literal::CreateR2WithLayout<int32>(
- {{11, 22}, {33, 44}}, LayoutUtil::MakeLayout(transfer_layout));
+ test_utils::CreateR2LiteralWithLayout<int32>({{11, 22}, {33, 44}},
+ transfer_layout);
auto computed = client_->Transfer(*data, &expected_literal->shape());
diff --git a/tensorflow/compiler/xla/tests/compilation_cache_test.cc b/tensorflow/compiler/xla/tests/compilation_cache_test.cc
index 0f780fa87e..707e439245 100644
--- a/tensorflow/compiler/xla/tests/compilation_cache_test.cc
+++ b/tensorflow/compiler/xla/tests/compilation_cache_test.cc
@@ -138,13 +138,13 @@ XLA_TEST_F(CompilationCacheTest, DifferentParameterLayouts) {
// layouts. Use these arrays as parameters to a simple computation. If the
  // layout of the array changes then the computation should be recompiled
  // (cache
// miss).
- auto rowmaj_array = Literal::CreateR2WithLayout(
- {{1.0f, 2.0f}, {3.0f, 4.0f}}, LayoutUtil::MakeLayout({1, 0}));
+ auto rowmaj_array = test_utils::CreateR2LiteralWithLayout(
+ {{1.0f, 2.0f}, {3.0f, 4.0f}}, /*minor_to_major=*/{1, 0});
auto rowmaj_handle =
client_->TransferToServer(*rowmaj_array).ConsumeValueOrDie();
- auto colmaj_array = Literal::CreateR2WithLayout(
- {{1.0f, 2.0f}, {3.0f, 4.0f}}, LayoutUtil::MakeLayout({0, 1}));
+ auto colmaj_array = test_utils::CreateR2LiteralWithLayout(
+ {{1.0f, 2.0f}, {3.0f, 4.0f}}, /*minor_to_major=*/{0, 1});
auto colmaj_handle =
client_->TransferToServer(*colmaj_array).ConsumeValueOrDie();
diff --git a/tensorflow/compiler/xla/tests/compute_constant_test.cc b/tensorflow/compiler/xla/tests/compute_constant_test.cc
index 5226a78386..d423c78476 100644
--- a/tensorflow/compiler/xla/tests/compute_constant_test.cc
+++ b/tensorflow/compiler/xla/tests/compute_constant_test.cc
@@ -264,8 +264,8 @@ XLA_TEST_F(ComputeConstantTest, Layout) {
ASSERT_TRUE(computed.ok()) << computed.status();
std::unique_ptr<Literal> expected_literal =
- Literal::CreateR2WithLayout<int32>({{11, 22}, {33, 44}},
- LayoutUtil::MakeLayout(layout));
+ test_utils::CreateR2LiteralWithLayout<int32>({{11, 22}, {33, 44}},
+ layout);
LiteralTestUtil::AssertEqualShapesAndLayouts(
expected_literal->shape(), computed.ValueOrDie()->shape());
LiteralTestUtil::ExpectEqual(*expected_literal, *computed.ValueOrDie());
diff --git a/tensorflow/compiler/xla/tests/convolution_test.cc b/tensorflow/compiler/xla/tests/convolution_test.cc
index 7425f778a6..0cc2e5fb7e 100644
--- a/tensorflow/compiler/xla/tests/convolution_test.cc
+++ b/tensorflow/compiler/xla/tests/convolution_test.cc
@@ -82,127 +82,177 @@ XLA_TEST_F(ConvolutionTest, ForwardPassConvolution_3x3x256_256_OutputZ_Iota) {
ComputationBuilder builder(client_, TestName());
auto lhs = builder.ConstantR4FromArray4D<float>(*alhs);
auto rhs = builder.ConstantR4FromArray4D<float>(*arhs);
- auto conv = builder.Conv(lhs, rhs, {1, 1}, Padding::kValid);
+ builder.Conv(lhs, rhs, {1, 1}, Padding::kValid);
- ComputeAndCompare(&builder, conv, {}, error_spec_);
+ std::unique_ptr<Array4D<float>> aexpected =
+ ReferenceUtil::ConvArray4D(*alhs, *arhs, {1, 1}, Padding::kValid);
+
+ ComputeAndCompareR4<float>(&builder, *aexpected, {}, error_spec_);
}
TEST_F(ConvolutionTest, Convolve_1x1x1x2_1x1x1x2_Valid) {
ComputationBuilder builder(client_, TestName());
- Shape input_shape = ShapeUtil::MakeShape(F32, {1, 1, 1, 2});
- Shape filter_shape = ShapeUtil::MakeShape(F32, {1, 1, 1, 2});
- auto input = builder.Parameter(0, input_shape, "input");
- auto filter = builder.Parameter(1, filter_shape, "filter");
- auto conv = builder.Conv(input, filter, {1, 1}, Padding::kValid);
-
- Array4D<float> input_data(1, 1, 1, 2);
- input_data.FillWithYX(Array2D<float>({
+ {
+ Shape input_shape = ShapeUtil::MakeShape(F32, {1, 1, 1, 2});
+ Shape filter_shape = ShapeUtil::MakeShape(F32, {1, 1, 1, 2});
+ auto input = builder.Parameter(0, input_shape, "input");
+ auto filter = builder.Parameter(1, filter_shape, "filter");
+ builder.Conv(input, filter, {1, 1}, Padding::kValid);
+ }
+
+ Array4D<float> input(1, 1, 1, 2);
+ input.FillWithYX(Array2D<float>({
{1, 2},
}));
- Array4D<float> filter_data(1, 1, 1, 2);
- filter_data.FillWithYX(Array2D<float>({
+ Array4D<float> filter(1, 1, 1, 2);
+ filter.FillWithYX(Array2D<float>({
{5, 6},
}));
- ComputeAndCompare(&builder, conv,
- {*Literal::CreateFromArray(input_data),
- *Literal::CreateFromArray(filter_data)},
- error_spec_);
+ std::unique_ptr<Array4D<float>> aexpected =
+ ReferenceUtil::ConvArray4D(input, filter, {1, 1}, Padding::kValid);
+
+ auto input_literal =
+ client_->TransferToServer(*Literal::CreateR4FromArray4D(input))
+ .ConsumeValueOrDie();
+ auto filter_literal =
+ client_->TransferToServer(*Literal::CreateR4FromArray4D(filter))
+ .ConsumeValueOrDie();
+
+ ComputeAndCompareR4<float>(&builder, *aexpected,
+ {input_literal.get(), filter_literal.get()},
+ error_spec_);
}
// Tests valid padding for 2D convolution in raster space.
TEST_F(ConvolutionTest, Convolve_1x1x4x4_1x1x2x2_Valid) {
ComputationBuilder builder(client_, TestName());
- Shape input_shape = ShapeUtil::MakeShape(F32, {1, 1, 4, 4});
- Shape filter_shape = ShapeUtil::MakeShape(F32, {1, 1, 2, 2});
- auto input = builder.Parameter(0, input_shape, "input");
- auto filter = builder.Parameter(1, filter_shape, "filter");
- auto conv = builder.Conv(input, filter, {1, 1}, Padding::kValid);
+ {
+ Shape input_shape = ShapeUtil::MakeShape(F32, {1, 1, 4, 4});
+ Shape filter_shape = ShapeUtil::MakeShape(F32, {1, 1, 2, 2});
+ auto input = builder.Parameter(0, input_shape, "input");
+ auto filter = builder.Parameter(1, filter_shape, "filter");
+ builder.Conv(input, filter, {1, 1}, Padding::kValid);
+ }
- Array4D<float> input_data(1, 1, 4, 4);
+ Array4D<float> input(1, 1, 4, 4);
// clang-format off
- input_data.FillWithYX(Array2D<float>({
+ input.FillWithYX(Array2D<float>({
{1, 2, 3, 4 },
{5, 6, 7, 8 },
{9, 10, 11, 12},
{13, 14, 15, 16},
}));
// clang-format on
- Array4D<float> filter_data(1, 1, 2, 2);
+ Array4D<float> filter(1, 1, 2, 2);
// clang-format off
- filter_data.FillWithYX(Array2D<float>({
+ filter.FillWithYX(Array2D<float>({
{5, 6},
{7, 8},
}));
// clang-format on
- ComputeAndCompare(&builder, conv,
- {*Literal::CreateFromArray(input_data),
- *Literal::CreateFromArray(filter_data)},
- error_spec_);
+
+ std::unique_ptr<Array4D<float>> aexpected =
+ ReferenceUtil::ConvArray4D(input, filter, {1, 1}, Padding::kValid);
+
+ auto input_literal =
+ client_->TransferToServer(*Literal::CreateR4FromArray4D(input))
+ .ConsumeValueOrDie();
+ auto filter_literal =
+ client_->TransferToServer(*Literal::CreateR4FromArray4D(filter))
+ .ConsumeValueOrDie();
+
+ ComputeAndCompareR4<float>(&builder, *aexpected,
+ {input_literal.get(), filter_literal.get()},
+ error_spec_);
}
// Tests same padding for 2D convolution in raster space.
TEST_F(ConvolutionTest, Convolve_1x1x4x4_1x1x2x2_Same) {
ComputationBuilder builder(client_, TestName());
- Shape input_shape = ShapeUtil::MakeShape(F32, {1, 1, 4, 4});
- Shape filter_shape = ShapeUtil::MakeShape(F32, {1, 1, 2, 2});
- auto input = builder.Parameter(0, input_shape, "input");
- auto filter = builder.Parameter(1, filter_shape, "filter");
- auto conv = builder.Conv(input, filter, {1, 1}, Padding::kSame);
+ {
+ Shape input_shape = ShapeUtil::MakeShape(F32, {1, 1, 4, 4});
+ Shape filter_shape = ShapeUtil::MakeShape(F32, {1, 1, 2, 2});
+ auto input = builder.Parameter(0, input_shape, "input");
+ auto filter = builder.Parameter(1, filter_shape, "filter");
+ builder.Conv(input, filter, {1, 1}, Padding::kSame);
+ }
- Array4D<float> input_data(1, 1, 4, 4);
+ Array4D<float> input(1, 1, 4, 4);
// clang-format off
- input_data.FillWithYX(Array2D<float>({
+ input.FillWithYX(Array2D<float>({
{1, 2, 3, 4 },
{5, 6, 7, 8 },
{9, 10, 11, 12},
{13, 14, 15, 16},
}));
// clang-format on
- Array4D<float> filter_data(1, 1, 2, 2);
+ Array4D<float> filter(1, 1, 2, 2);
// clang-format off
- filter_data.FillWithYX(Array2D<float>({
+ filter.FillWithYX(Array2D<float>({
{5, 6},
{7, 8},
}));
// clang-format on
- ComputeAndCompare(&builder, conv,
- {*Literal::CreateFromArray(input_data),
- *Literal::CreateFromArray(filter_data)},
- error_spec_);
+
+ std::unique_ptr<Array4D<float>> aexpected =
+ ReferenceUtil::ConvArray4D(input, filter, {1, 1}, Padding::kSame);
+
+ auto input_literal =
+ client_->TransferToServer(*Literal::CreateR4FromArray4D(input))
+ .ConsumeValueOrDie();
+ auto filter_literal =
+ client_->TransferToServer(*Literal::CreateR4FromArray4D(filter))
+ .ConsumeValueOrDie();
+
+ ComputeAndCompareR4<float>(&builder, *aexpected,
+ {input_literal.get(), filter_literal.get()},
+ error_spec_);
}
// Tests same padding for 2D convolution in raster space with an odd sized
// kernel.
TEST_F(ConvolutionTest, Convolve_1x1x4x4_1x1x3x3_Same) {
ComputationBuilder builder(client_, TestName());
- Shape input_shape = ShapeUtil::MakeShape(F32, {1, 1, 4, 4});
- Shape filter_shape = ShapeUtil::MakeShape(F32, {1, 1, 3, 3});
- auto input = builder.Parameter(0, input_shape, "input");
- auto filter = builder.Parameter(1, filter_shape, "filter");
- auto conv = builder.Conv(input, filter, {1, 1}, Padding::kSame);
+ {
+ Shape input_shape = ShapeUtil::MakeShape(F32, {1, 1, 4, 4});
+ Shape filter_shape = ShapeUtil::MakeShape(F32, {1, 1, 3, 3});
+ auto input = builder.Parameter(0, input_shape, "input");
+ auto filter = builder.Parameter(1, filter_shape, "filter");
+ builder.Conv(input, filter, {1, 1}, Padding::kSame);
+ }
- Array4D<float> input_data(1, 1, 4, 4);
+ Array4D<float> input(1, 1, 4, 4);
// clang-format off
- input_data.FillWithYX(Array2D<float>({
+ input.FillWithYX(Array2D<float>({
{1, 2, 3, 4 },
{5, 6, 7, 8 },
{9, 10, 11, 12},
{13, 14, 15, 16},
}));
// clang-format on
- Array4D<float> filter_data(1, 1, 3, 3);
+ Array4D<float> filter(1, 1, 3, 3);
// clang-format off
- filter_data.FillWithYX(Array2D<float>({
+ filter.FillWithYX(Array2D<float>({
{ 5, 6, 7},
{ 8, 9, 10},
{11, 12, 13},
}));
// clang-format on
- ComputeAndCompare(&builder, conv,
- {*Literal::CreateFromArray(input_data),
- *Literal::CreateFromArray(filter_data)},
- error_spec_);
+
+ std::unique_ptr<Array4D<float>> aexpected =
+ ReferenceUtil::ConvArray4D(input, filter, {1, 1}, Padding::kSame);
+
+ auto input_literal =
+ client_->TransferToServer(*Literal::CreateR4FromArray4D(input))
+ .ConsumeValueOrDie();
+ auto filter_literal =
+ client_->TransferToServer(*Literal::CreateR4FromArray4D(filter))
+ .ConsumeValueOrDie();
+
+ ComputeAndCompareR4<float>(&builder, *aexpected,
+ {input_literal.get(), filter_literal.get()},
+ error_spec_);
}
XLA_TEST_F(ConvolutionTest, Convolve1D_1x2x5_1x2x2_Valid) {
diff --git a/tensorflow/compiler/xla/tests/dot_operation_test.cc b/tensorflow/compiler/xla/tests/dot_operation_test.cc
index b72dd2707c..cf089d748d 100644
--- a/tensorflow/compiler/xla/tests/dot_operation_test.cc
+++ b/tensorflow/compiler/xla/tests/dot_operation_test.cc
@@ -177,15 +177,15 @@ void DotOperationTest::TestSquareMatrixDot(bool lhs_row_major,
bool rhs_row_major) {
auto lhs_handle =
client_
- ->TransferToServer(*Literal::CreateR2WithLayout<Element>(
+ ->TransferToServer(*test_utils::CreateR2LiteralWithLayout<Element>(
{{1.0, 2.0}, {3.0, -4.0}},
- LayoutUtil::MakeLayout(MinorToMajorForIsRowMajor(lhs_row_major))))
+ MinorToMajorForIsRowMajor(lhs_row_major)))
.ConsumeValueOrDie();
auto rhs_handle =
client_
- ->TransferToServer(*Literal::CreateR2WithLayout<Element>(
+ ->TransferToServer(*test_utils::CreateR2LiteralWithLayout<Element>(
{{1.0, 6.0}, {7.0, -4.0}},
- LayoutUtil::MakeLayout(MinorToMajorForIsRowMajor(rhs_row_major))))
+ MinorToMajorForIsRowMajor(rhs_row_major)))
.ConsumeValueOrDie();
ComputationBuilder builder(client_, TestName());
@@ -277,62 +277,6 @@ XLA_TEST_F(DotOperationTest, MatrixDotF32_260_3_520_MinorToMajorFF) {
TestMatrixDot(260, 3, 520, false, false);
}
-XLA_TEST_F(DotOperationTest, MatrixVectorDotF32_1x8x8) {
- TestMatrixDot(1, 8, 8, true, true);
-}
-
-XLA_TEST_F(DotOperationTest, MatrixVectorDotF32_1x130x8) {
- TestMatrixDot(1, 130, 8, true, true);
-}
-
-XLA_TEST_F(DotOperationTest, MatrixVectorDotF32_1x8x130) {
- TestMatrixDot(1, 8, 130, true, true);
-}
-
-XLA_TEST_F(DotOperationTest, MatrixVectorDotF32_1x290x130) {
- TestMatrixDot(1, 290, 130, true, true);
-}
-
-XLA_TEST_F(DotOperationTest, MatrixVectorDotF32_2x1x1) {
- TestMatrixDot(2, 1, 1, true, true);
-}
-
-XLA_TEST_F(DotOperationTest, MatrixVectorDotF32_8x8x1) {
- TestMatrixDot(8, 8, 1, true, true);
-}
-
-XLA_TEST_F(DotOperationTest, MatrixVectorDotF32_16x1x1) {
- TestMatrixDot(16, 1, 1, true, true);
-}
-
-XLA_TEST_F(DotOperationTest, MatrixVectorDotF32_16x3x1) {
- TestMatrixDot(16, 3, 1, true, true);
-}
-
-XLA_TEST_F(DotOperationTest, MatrixVectorDotF32_3x3x1) {
- TestMatrixDot(3, 3, 1, true, true);
-}
-
-XLA_TEST_F(DotOperationTest, MatrixVectorDotF32_29x29x1) {
- TestMatrixDot(29, 29, 1, true, true);
-}
-
-XLA_TEST_F(DotOperationTest, MatrixVectorDotF32_1x8x2) {
- TestMatrixDot(1, 8, 2, true, true);
-}
-
-XLA_TEST_F(DotOperationTest, MatrixVectorDotF32_1x2x8) {
- TestMatrixDot(1, 2, 8, true, true);
-}
-
-XLA_TEST_F(DotOperationTest, MatrixVectorDotF32_259x258x1) {
- TestMatrixDot(259, 258, 1, true, true);
-}
-
-XLA_TEST_F(DotOperationTest, MatrixVectorDotF32_259x258x1_FT) {
- TestMatrixDot(259, 258, 1, false, true);
-}
-
XLA_TEST_F(DotOperationTest, SquareMatrixDotF32MinorToMajorFF) {
constexpr bool kLhsRowMajor = false;
constexpr bool kRhsRowMajor = false;
@@ -362,15 +306,15 @@ void DotOperationTest::TestNonsquareMatrixDot(bool lhs_row_major,
bool rhs_row_major) {
auto lhs_handle =
client_
- ->TransferToServer(*Literal::CreateR2WithLayout<Element>(
+ ->TransferToServer(*test_utils::CreateR2LiteralWithLayout<Element>(
{{1.0, 2.0, 3.0}, {3.0, -4.0, -1.0}},
- LayoutUtil::MakeLayout(MinorToMajorForIsRowMajor(lhs_row_major))))
+ MinorToMajorForIsRowMajor(lhs_row_major)))
.ConsumeValueOrDie();
auto rhs_handle =
client_
- ->TransferToServer(*Literal::CreateR2WithLayout<Element>(
+ ->TransferToServer(*test_utils::CreateR2LiteralWithLayout<Element>(
{{1.0, 6.0}, {2.0, 3.0}, {7.0, -4.0}},
- LayoutUtil::MakeLayout(MinorToMajorForIsRowMajor(rhs_row_major))))
+ MinorToMajorForIsRowMajor(rhs_row_major)))
.ConsumeValueOrDie();
ComputationBuilder builder(client_, TestName());
@@ -417,31 +361,6 @@ XLA_TEST_F(DotOperationTest, NonsquareMatrixDotC64) {
TestNonsquareMatrixDot<complex64>();
}
-XLA_TEST_F(DotOperationTest, MatrixVectorC64) {
- auto lhs_handle =
- client_
- ->TransferToServer(*Literal::CreateR2WithLayout<complex64>(
- {{1.0, 2.0, 3.0, -4.0}}, LayoutUtil::MakeLayout({1, 0})))
- .ConsumeValueOrDie();
- auto rhs_handle =
- client_
- ->TransferToServer(*Literal::CreateR2WithLayout<complex64>(
- {{1.0, 1.0}, {2.0, 2.0}, {3.0, 3.0}, {-4.0, 4.0}},
- LayoutUtil::MakeLayout({1, 0})))
- .ConsumeValueOrDie();
-
- ComputationBuilder builder(client_, TestName());
- auto prim_type = primitive_util::NativeToPrimitiveType<complex64>();
- auto result = builder.Dot(
- builder.Parameter(0, ShapeUtil::MakeShape(prim_type, {1, 4}), "lhs"),
- builder.Parameter(1, ShapeUtil::MakeShape(prim_type, {4, 2}), "rhs"));
-
- Array2D<complex64> expected({{30.0, -2.0}});
-
- ComputeAndCompareR2<complex64>(
- &builder, expected, {lhs_handle.get(), rhs_handle.get()}, error_spec_);
-}
-
XLA_TEST_F(DotOperationTest, ConcurrentMatMul) {
ComputationBuilder builder(client_, TestName());
auto matrix1 = builder.ConstantR2<float>({{1.0, 2.0}, {3.0, 4.0}});
diff --git a/tensorflow/compiler/xla/tests/literal_test_util.cc b/tensorflow/compiler/xla/tests/literal_test_util.cc
index 75c9a0d3fb..95a52ecd2f 100644
--- a/tensorflow/compiler/xla/tests/literal_test_util.cc
+++ b/tensorflow/compiler/xla/tests/literal_test_util.cc
@@ -116,18 +116,16 @@ template <typename FloatT, typename UnsignedT>
::testing::AssertionResult CompareFloatsBitwiseEqual(FloatT lhs, FloatT rhs) {
auto ulhs = tensorflow::bit_cast<UnsignedT>(lhs);
auto urhs = tensorflow::bit_cast<UnsignedT>(rhs);
- auto lhs_double = static_cast<double>(lhs);
- auto rhs_double = static_cast<double>(rhs);
if (ulhs != urhs) {
return ::testing::AssertionFailure() << tensorflow::strings::Printf(
"floating values are not bitwise-equal; and equality testing "
"was requested: %s=%g=%a vs %s=%g=%a",
tensorflow::strings::StrCat(tensorflow::strings::Hex(ulhs))
.c_str(),
- lhs_double, lhs_double,
+ lhs, lhs,
tensorflow::strings::StrCat(tensorflow::strings::Hex(urhs))
.c_str(),
- rhs_double, rhs_double);
+ rhs, rhs);
}
return ::testing::AssertionSuccess();
}
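A minimal standalone sketch (plain C++, not part of this diff) of why the
helper above compares bit patterns rather than using ==: operator== treats
-0.0f and 0.0f as equal and NaN as unequal to itself, while comparing the
bit_cast representations distinguishes signed zeros and matches NaNs.

#include <cstdint>
#include <cstring>
#include <iostream>
#include <limits>

// Stand-in for tensorflow::bit_cast<uint32_t>(float).
uint32_t BitsOf(float f) {
  uint32_t u;
  std::memcpy(&u, &f, sizeof u);  // well-defined way to read the float's bits
  return u;
}

int main() {
  float pz = 0.0f, nz = -0.0f;
  float nan = std::numeric_limits<float>::quiet_NaN();
  std::cout << (pz == nz) << '\n';                    // 1: == ignores the zero's sign
  std::cout << (BitsOf(pz) == BitsOf(nz)) << '\n';    // 0: the bits differ
  std::cout << (nan == nan) << '\n';                  // 0: NaN != NaN under ==
  std::cout << (BitsOf(nan) == BitsOf(nan)) << '\n';  // 1: identical bits match
}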
@@ -151,10 +149,6 @@ template <typename NativeT>
// Specializations for floating types that do bitwise comparisons when equality
// comparison is requested.
template <>
-::testing::AssertionResult CompareEqual<bfloat16>(bfloat16 lhs, bfloat16 rhs) {
- return CompareFloatsBitwiseEqual<bfloat16, uint16>(lhs, rhs);
-}
-template <>
::testing::AssertionResult CompareEqual<float>(float lhs, float rhs) {
return CompareFloatsBitwiseEqual<float, uint32>(lhs, rhs);
}
@@ -244,9 +238,6 @@ bool ExpectLiteralsEqual(const Literal& expected, const Literal& actual,
case U64:
match = ExpectLiteralsEqual<uint64>(expected, actual, &multi_index, 0);
break;
- case BF16:
- match = ExpectLiteralsEqual<bfloat16>(expected, actual, &multi_index, 0);
- break;
case F32:
match = ExpectLiteralsEqual<float>(expected, actual, &multi_index, 0);
break;
diff --git a/tensorflow/compiler/xla/tests/llvm_compiler_test.cc b/tensorflow/compiler/xla/tests/llvm_compiler_test.cc
index 70d8b764a3..458258e7ee 100644
--- a/tensorflow/compiler/xla/tests/llvm_compiler_test.cc
+++ b/tensorflow/compiler/xla/tests/llvm_compiler_test.cc
@@ -14,118 +14,49 @@ limitations under the License.
==============================================================================*/
#include "tensorflow/compiler/xla/service/llvm_compiler.h"
-#include "tensorflow/compiler/xla/service/backend.h"
-#include "tensorflow/compiler/xla/service/cpu/cpu_compiler.h"
-#include "tensorflow/compiler/xla/service/gpu/gpu_compiler.h"
#include "tensorflow/compiler/xla/service/hlo_instruction.h"
-#include "tensorflow/compiler/xla/service/platform_util.h"
-#include "tensorflow/compiler/xla/test_helpers.h"
+#include "tensorflow/compiler/xla/tests/hlo_test_base.h"
+#include "tensorflow/compiler/xla/tests/test_macros.h"
#include "tensorflow/core/platform/test.h"
-#include "tensorflow/stream_executor/stream_executor.h"
namespace xla {
namespace {
-class LLVMCompilerTest : public ::testing::Test {
- public:
- void SetUp() override {
- Platform *platform = FindPlatform();
- ASSERT_NE(platform, nullptr);
-
- BackendOptions backend_options;
- backend_options.set_platform(platform);
- StatusOr<std::unique_ptr<Backend>> backend_or_status =
- Backend::CreateBackend(backend_options);
- ASSERT_IS_OK(backend_or_status.status());
- backend_ = backend_or_status.ConsumeValueOrDie();
- }
-
- ~LLVMCompilerTest() override {}
-
- protected:
- using Platform = ::perftools::gputools::Platform;
-
- explicit LLVMCompilerTest(string platform_name)
- : platform_name_(std::move(platform_name)) {}
-
- void TestCompilerHooks(LLVMCompiler *compiler) {
- int pre_opt_hook_call_count = 0;
- int post_opt_hook_call_count = 0;
-
- auto pre_opt_hook = [&pre_opt_hook_call_count](const llvm::Module &) {
- ++pre_opt_hook_call_count;
- return Status::OK();
- };
- auto post_opt_hook = [&post_opt_hook_call_count](const llvm::Module &) {
- ++post_opt_hook_call_count;
- return Status::OK();
- };
-
- // Create HLO module, and run the compiler.
- auto builder = HloComputation::Builder(TestName());
- builder.AddInstruction(
- HloInstruction::CreateConstant(Literal::CreateR0<float>(42.0)));
-
- auto hlo_module = CreateNewModule();
- hlo_module->AddEntryComputation(builder.Build());
-
- compiler->SetPreOptimizationHook(pre_opt_hook);
- compiler->SetPostOptimizationHook(post_opt_hook);
-
- ASSERT_TRUE(compiler
- ->Compile(std::move(hlo_module),
- backend_->default_stream_executor())
- .ok());
-
- // Test that hooks were called.
- EXPECT_EQ(1, pre_opt_hook_call_count);
- EXPECT_EQ(1, post_opt_hook_call_count);
- }
-
- private:
- Platform *FindPlatform() {
- for (Platform *platform :
- PlatformUtil::GetSupportedPlatforms().ConsumeValueOrDie()) {
- if (platform->Name() == platform_name_) {
- return platform;
- }
- }
- return nullptr;
- }
-
- string platform_name_;
- std::unique_ptr<Backend> backend_;
-
- static string TestName() {
- return ::testing::UnitTest::GetInstance()->current_test_info()->name();
- }
-
- static std::unique_ptr<HloModule> CreateNewModule() {
- HloModuleConfig config;
- config.set_debug_options(legacy_flags::GetDebugOptionsFromFlags());
- return MakeUnique<HloModule>(TestName(), VersionedComputationHandle(),
- config);
- }
-};
-
-class CpuCompilerTest : public LLVMCompilerTest {
- public:
- CpuCompilerTest() : LLVMCompilerTest("Host") {}
-};
-
-class GpuCompilerTest : public LLVMCompilerTest {
- public:
- GpuCompilerTest() : LLVMCompilerTest("CUDA") {}
-};
-
-TEST_F(CpuCompilerTest, HooksTest) {
- cpu::CpuCompiler compiler;
- TestCompilerHooks(&compiler);
-}
-
-TEST_F(GpuCompilerTest, HooksTest) {
- gpu::GpuCompiler compiler;
- TestCompilerHooks(&compiler);
+class LLVMCompilerTest : public HloTestBase {};
+
+XLA_TEST_F(LLVMCompilerTest, CompilerHooks) {
+ int pre_opt_hook_call_count = 0;
+ int post_opt_hook_call_count = 0;
+
+ auto pre_opt_hook = [&pre_opt_hook_call_count](const llvm::Module &) {
+ ++pre_opt_hook_call_count;
+ return Status::OK();
+ };
+ auto post_opt_hook = [&post_opt_hook_call_count](const llvm::Module &) {
+ ++post_opt_hook_call_count;
+ return Status::OK();
+ };
+
+ // Create HLO module, and run the compiler.
+ auto builder = HloComputation::Builder(TestName());
+ builder.AddInstruction(
+ HloInstruction::CreateConstant(Literal::CreateR0<float>(42.0)));
+
+ auto hlo_module = CreateNewModule();
+ hlo_module->AddEntryComputation(builder.Build());
+
+ auto compiler = static_cast<LLVMCompiler *>(backend().compiler());
+ compiler->SetPreOptimizationHook(pre_opt_hook);
+ compiler->SetPostOptimizationHook(post_opt_hook);
+
+ ASSERT_TRUE(
+ compiler
+ ->Compile(std::move(hlo_module), backend().default_stream_executor())
+ .ok());
+
+ // Test that hooks were called.
+ EXPECT_EQ(1, pre_opt_hook_call_count);
+ EXPECT_EQ(1, post_opt_hook_call_count);
}
} // namespace
diff --git a/tensorflow/compiler/xla/tests/local_client_execute_test.cc b/tensorflow/compiler/xla/tests/local_client_execute_test.cc
index a196e250d1..329b53012f 100644
--- a/tensorflow/compiler/xla/tests/local_client_execute_test.cc
+++ b/tensorflow/compiler/xla/tests/local_client_execute_test.cc
@@ -136,14 +136,16 @@ XLA_TEST_F(LocalClientExecuteTest, AddArraysWithDifferentInputLayouts) {
auto computation = builder.Build().ConsumeValueOrDie();
// Create x as a col-major array.
- auto x_array = LiteralToShapedBuffer(*Literal::CreateR2WithLayout(
- {{1.0f, 2.0f}, {3.0f, 4.0f}}, LayoutUtil::MakeLayout({0, 1})));
+ auto x_array = LiteralToShapedBuffer(
+ *test_utils::CreateR2LiteralWithLayout({{1.0f, 2.0f}, {3.0f, 4.0f}},
+ /*minor_to_major=*/{0, 1}));
EXPECT_TRUE(LayoutUtil::Equal(x_array->shape().layout(),
LayoutUtil::MakeLayout({0, 1})));
// Create y as a row-major array.
- auto y_array = LiteralToShapedBuffer(*Literal::CreateR2WithLayout(
- {{10.0f, 20.0f}, {30.0f, 40.0f}}, LayoutUtil::MakeLayout({1, 0})));
+ auto y_array = LiteralToShapedBuffer(
+ *test_utils::CreateR2LiteralWithLayout({{10.0f, 20.0f}, {30.0f, 40.0f}},
+ /*minor_to_major=*/{1, 0}));
EXPECT_TRUE(LayoutUtil::Equal(y_array->shape().layout(),
LayoutUtil::MakeLayout({1, 0})));
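As an aside on the minor_to_major values used here: the list orders dimensions
from fastest- to slowest-varying, so for a rank-2 array {1, 0} is row-major and
{0, 1} is column-major. A small standalone sketch (plain C++, illustrative
names, not part of this diff):

#include <array>
#include <cstdio>

int main() {
  // Logical 2x2 contents: {{1, 2}, {3, 4}}.
  // minor_to_major = {1, 0}: dimension 1 varies fastest -> row-major storage.
  std::array<int, 4> row_major = {1, 2, 3, 4};
  // minor_to_major = {0, 1}: dimension 0 varies fastest -> column-major storage.
  std::array<int, 4> col_major = {1, 3, 2, 4};
  int i = 1, j = 0;  // element (1, 0), logically 3
  std::printf("%d %d\n", row_major[i * 2 + j], col_major[j * 2 + i]);  // "3 3"
}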
diff --git a/tensorflow/compiler/xla/tests/local_client_test_base.cc b/tensorflow/compiler/xla/tests/local_client_test_base.cc
index d98875dbc2..c11e1df0a7 100644
--- a/tensorflow/compiler/xla/tests/local_client_test_base.cc
+++ b/tensorflow/compiler/xla/tests/local_client_test_base.cc
@@ -12,12 +12,13 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
-#define EIGEN_USE_THREADS
#include "tensorflow/compiler/xla/tests/local_client_test_base.h"
#include <vector>
+#define EIGEN_USE_THREADS
+
#include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
#include "tensorflow/compiler/xla/client/local_client.h"
#include "tensorflow/compiler/xla/map_util.h"
diff --git a/tensorflow/compiler/xla/tests/map_test.cc b/tensorflow/compiler/xla/tests/map_test.cc
index 2b0f7e6e80..2ef392508d 100644
--- a/tensorflow/compiler/xla/tests/map_test.cc
+++ b/tensorflow/compiler/xla/tests/map_test.cc
@@ -405,13 +405,13 @@ TEST_F(MapTest, MapBinaryAdder) {
// for Map that used to fail in shape inference (b/28989438).
XLA_TEST_F(MapTest, AddWithMixedLayouts) {
ComputationBuilder builder(client_, TestName());
- std::unique_ptr<Literal> param0_literal = Literal::CreateR2WithLayout(
- {{1, 2}, {3, 4}}, LayoutUtil::MakeLayout({1, 0}));
+ std::unique_ptr<Literal> param0_literal =
+ test_utils::CreateR2LiteralWithLayout({{1, 2}, {3, 4}}, {1, 0});
std::unique_ptr<GlobalData> param0_data =
client_->TransferToServer(*param0_literal).ConsumeValueOrDie();
- std::unique_ptr<Literal> param1_literal = Literal::CreateR2WithLayout(
- {{10, 20}, {30, 40}}, LayoutUtil::MakeLayout({0, 1}));
+ std::unique_ptr<Literal> param1_literal =
+ test_utils::CreateR2LiteralWithLayout({{10, 20}, {30, 40}}, {0, 1});
std::unique_ptr<GlobalData> param1_data =
client_->TransferToServer(*param1_literal).ConsumeValueOrDie();
diff --git a/tensorflow/compiler/xla/tests/reshape_test.cc b/tensorflow/compiler/xla/tests/reshape_test.cc
index d235b9a158..72c68f24a0 100644
--- a/tensorflow/compiler/xla/tests/reshape_test.cc
+++ b/tensorflow/compiler/xla/tests/reshape_test.cc
@@ -431,9 +431,8 @@ XLA_TEST_F(ReshapeTest, ToScalar) {
XLA_TEST_F(ReshapeTest, BadDimensions) {
ComputationBuilder b(client_, TestName());
b.Reshape(b.ConstantR1<int32>({1}), {}, {});
- EXPECT_THAT(
- ExecuteToString(&b, {}),
- ::testing::HasSubstr("not a permutation of the operand dimensions"));
+ EXPECT_THAT(ExecuteToString(&b, {}),
+ ::testing::HasSubstr("dimensions not a permutation"));
}
XLA_TEST_F(ReshapeTest, BadNewSizes) {
diff --git a/tensorflow/compiler/xla/tests/test_utils.cc b/tensorflow/compiler/xla/tests/test_utils.cc
deleted file mode 100644
index cdd3d66bbb..0000000000
--- a/tensorflow/compiler/xla/tests/test_utils.cc
+++ /dev/null
@@ -1,120 +0,0 @@
-/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
- http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-
-#include "tensorflow/compiler/xla/tests/test_utils.h"
-
-#include "tensorflow/compiler/xla/primitive_util.h"
-
-namespace xla {
-
-namespace {
-
-template <typename FloatT>
-void PopulateWithRandomFloatingPointData(Literal* literal) {
- CHECK_EQ(literal->shape().element_type(),
- primitive_util::NativeToPrimitiveType<FloatT>());
- std::minstd_rand0 engine;
- std::uniform_real_distribution<FloatT> generator(0.0f, 1.0f);
- TF_CHECK_OK(literal->Populate<FloatT>(
- [&](tensorflow::gtl::ArraySlice<int64> /*indices*/) {
- return generator(engine);
- }));
-}
-
-template <typename IntT>
-void PopulateWithRandomIntegralData(Literal* literal) {
- CHECK_EQ(literal->shape().element_type(),
- primitive_util::NativeToPrimitiveType<IntT>());
- std::minstd_rand0 engine;
- std::uniform_int_distribution<IntT> generator(
- std::numeric_limits<IntT>::lowest(), std::numeric_limits<IntT>::max());
- TF_CHECK_OK(literal->Populate<IntT>(
- [&](tensorflow::gtl::ArraySlice<int64> /*indices*/) {
- return generator(engine);
- }));
-}
-
-} // namespace
-
-StatusOr<std::unique_ptr<Literal>> MakeFakeLiteral(const Shape& shape) {
- if (ShapeUtil::IsTuple(shape)) {
- std::vector<std::unique_ptr<Literal>> elements;
- for (const Shape& element_shape : shape.tuple_shapes()) {
- TF_ASSIGN_OR_RETURN(std::unique_ptr<Literal> element,
- MakeFakeLiteral(element_shape));
- elements.push_back(std::move(element));
- }
- return Literal::MakeTupleOwned(std::move(elements));
- }
- std::unique_ptr<Literal> literal = Literal::CreateFromShape(shape);
- switch (shape.element_type()) {
- case F32:
- PopulateWithRandomFloatingPointData<float>(literal.get());
- break;
- case F64:
- PopulateWithRandomFloatingPointData<double>(literal.get());
- break;
- case S8:
- PopulateWithRandomIntegralData<int8>(literal.get());
- break;
- case U8:
- PopulateWithRandomIntegralData<uint8>(literal.get());
- break;
- case S16:
- PopulateWithRandomIntegralData<int16>(literal.get());
- break;
- case U16:
- PopulateWithRandomIntegralData<uint16>(literal.get());
- break;
- case S32:
- PopulateWithRandomIntegralData<int32>(literal.get());
- break;
- case U32:
- PopulateWithRandomIntegralData<uint32>(literal.get());
- break;
- case S64:
- PopulateWithRandomIntegralData<int64>(literal.get());
- break;
- case U64:
- PopulateWithRandomIntegralData<uint64>(literal.get());
- break;
- case PRED: {
- std::uniform_int_distribution<int> generator(0, 1);
- std::minstd_rand0 engine;
- TF_CHECK_OK(literal->Populate<bool>(
- [&](tensorflow::gtl::ArraySlice<int64> /*indices*/) {
- return generator(engine);
- }));
- break;
- }
- default:
- return Unimplemented("Unsupported type for fake literal generation: %s",
- ShapeUtil::HumanString(shape).c_str());
- }
- return std::move(literal);
-}
-
-StatusOr<std::vector<std::unique_ptr<Literal>>> MakeFakeArguments(
- const HloModule& module) {
- std::vector<std::unique_ptr<Literal>> arguments;
- for (const ShapeLayout& shape_layout :
- module.config().entry_computation_layout().parameter_layouts()) {
- TF_ASSIGN_OR_RETURN(auto literal, MakeFakeLiteral(shape_layout.shape()));
- arguments.push_back(std::move(literal));
- }
- return std::move(arguments);
-}
-
-} // namespace xla
diff --git a/tensorflow/compiler/xla/tests/test_utils.h b/tensorflow/compiler/xla/tests/test_utils.h
index 12d5255fce..f3a522b05e 100644
--- a/tensorflow/compiler/xla/tests/test_utils.h
+++ b/tensorflow/compiler/xla/tests/test_utils.h
@@ -23,12 +23,12 @@ limitations under the License.
#include "tensorflow/compiler/xla/layout_util.h"
#include "tensorflow/compiler/xla/literal_util.h"
#include "tensorflow/compiler/xla/ptr_util.h"
-#include "tensorflow/compiler/xla/service/hlo_module.h"
#include "tensorflow/compiler/xla/xla_data.pb.h"
#include "tensorflow/core/lib/gtl/array_slice.h"
#include "tensorflow/core/platform/types.h"
namespace xla {
+namespace test_utils {
// A class which generates pseudorandom numbers of a given type within a given
// range. Not cryptographically secure and likely not perfectly evenly
@@ -53,15 +53,63 @@ class PseudorandomGenerator {
std::mt19937 generator_;
};
-// Generates fake data in a literal of the given shape, or returns an error
-// status if the element type is currently unhandled for fake data generation.
-StatusOr<std::unique_ptr<Literal>> MakeFakeLiteral(const Shape& shape);
+// Convenience function for creating a rank-2 array with arbitrary layout.
+template <typename NativeT>
+std::unique_ptr<Literal> CreateR2LiteralWithLayout(
+ std::initializer_list<std::initializer_list<NativeT>> values,
+ tensorflow::gtl::ArraySlice<int64> minor_to_major) {
+ auto literal = MakeUnique<Literal>();
+ const int64 d0 = values.size();
+ const int64 d1 = values.begin()->size();
+ literal.get()->PopulateWithValue<NativeT>(0, {d0, d1});
+ *literal->mutable_shape()->mutable_layout() =
+ LayoutUtil::MakeLayout(minor_to_major);
+ TF_CHECK_OK(ShapeUtil::ValidateShape(literal->shape()));
+
+ int64 dim0 = 0;
+ for (auto inner_list : values) {
+ int64 dim1 = 0;
+ for (auto value : inner_list) {
+ literal.get()->Set({dim0, dim1}, value);
+ ++dim1;
+ }
+ ++dim0;
+ }
+ return literal;
+}
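+// A hypothetical call site, for concreteness (mirrors the map_test.cc hunk
+// earlier in this diff): a 2x2 int32 literal stored row-major.
+//
+//   std::unique_ptr<Literal> lit = test_utils::CreateR2LiteralWithLayout<int32>(
+//       {{1, 2}, {3, 4}}, /*minor_to_major=*/{1, 0});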
-// Generates a vector of arguments containing fake data. The number, shape and
-// layout of the arguments is appropriate for given HLO module.
-StatusOr<std::vector<std::unique_ptr<Literal>>> MakeFakeArguments(
- const HloModule& module);
+// Convenience function for creating a rank-3 array with arbitrary layout.
+template <typename NativeT>
+std::unique_ptr<Literal> CreateR3LiteralWithLayout(
+ std::initializer_list<std::initializer_list<std::initializer_list<NativeT>>>
+ values,
+ tensorflow::gtl::ArraySlice<int64> minor_to_major) {
+ auto literal = MakeUnique<Literal>();
+ const int64 d0 = values.size();
+ const int64 d1 = values.begin()->size();
+ const int64 d2 = values.begin()->begin()->size();
+ literal.get()->PopulateWithValue<NativeT>(0, {d0, d1, d2});
+ *literal->mutable_shape()->mutable_layout() =
+ LayoutUtil::MakeLayout(minor_to_major);
+ TF_CHECK_OK(ShapeUtil::ValidateShape(literal->shape()));
+
+ int64 dim0 = 0;
+ for (auto inner_list : values) {
+ int64 dim1 = 0;
+ for (auto inner_inner_list : inner_list) {
+ int64 dim2 = 0;
+ for (auto value : inner_inner_list) {
+ literal.get()->Set({dim0, dim1, dim2}, value);
+ ++dim2;
+ }
+ ++dim1;
+ }
+ ++dim0;
+ }
+ return literal;
+}
+} // namespace test_utils
} // namespace xla
#endif // TENSORFLOW_COMPILER_XLA_TESTS_TEST_UTILS_H_
diff --git a/tensorflow/compiler/xla/tools/BUILD b/tensorflow/compiler/xla/tools/BUILD
index 091fa0c3ec..759921dce5 100644
--- a/tensorflow/compiler/xla/tools/BUILD
+++ b/tensorflow/compiler/xla/tools/BUILD
@@ -88,7 +88,6 @@ cc_library(
"//tensorflow/compiler/xla/client:local_client",
"//tensorflow/compiler/xla/client/lib:testing",
"//tensorflow/compiler/xla/service:session_proto",
- "//tensorflow/compiler/xla/tests:test_utils",
"//tensorflow/core:framework_internal",
"//tensorflow/core:lib",
],
diff --git a/tensorflow/compiler/xla/tools/parser/README.md b/tensorflow/compiler/xla/tools/parser/README.md
index b768b94e77..2c864d77a2 100644
--- a/tensorflow/compiler/xla/tools/parser/README.md
+++ b/tensorflow/compiler/xla/tools/parser/README.md
@@ -43,22 +43,14 @@ operand
: shape name
;
-attributes
+extra_attributes
: /*empty*/
- | ',' attribute
- | ',' attribute attributes
+ | ',' extra_attribute
+ | ',' extra_attribute extra_attributes
;
-attribute
+extra_attribute
: attribute_name attribute_value
;
-attribute_value
- : kInt
- | kName
- | [0-9bf]{3,}_[0-9io]{3,}->[0-9bf]{3,} /*dim_labels_pattern*/
- | [0-9]+(x[0-9]+)+ /*dxd_pattern*/
- | [0-9]+_[0-9]+(_[0-9]+)?(x[0-9]+_[0-9]+(_[0-9]+)?)* /*pad_pattern*/
- | '{' sub_attributes '}'
- ;
param_list
: '(' param_list1 ')'
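For example, instruction lines under this grammar might read (illustrative
names):

  %p = f32[] parameter(0), sharding={replicated}
  %loop = f32[] while(f32[] %init), condition=%cond, body=%body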
diff --git a/tensorflow/compiler/xla/tools/parser/hlo_lexer.cc b/tensorflow/compiler/xla/tools/parser/hlo_lexer.cc
index b5befbf58b..d104ff3460 100644
--- a/tensorflow/compiler/xla/tools/parser/hlo_lexer.cc
+++ b/tensorflow/compiler/xla/tools/parser/hlo_lexer.cc
@@ -122,7 +122,7 @@ TokKind HloLexer::LexToken() {
current_ptr_++;
return TokKind::kArrow;
}
- return LexNumberOrPattern();
+ return LexDigitOrNegative();
case '=':
return TokKind::kEqual;
case ',':
@@ -149,15 +149,12 @@ TokKind HloLexer::LexToken() {
}
}
-// Lex a shape, name, keyword, opcode, attribute name, or the dim labels
-// pattern.
-//
+// Lex a shape, name, keyword, opcode, or attribute name.
// shape ::= ([a-zA-Z0-9_]*[0-9]*)\[([0-9,]*)\](?:\s*{([0-9,]*)})?
// name ::= [a-zA-Z_][a-zA-Z0-9_.-]*:
// keyword ::= HloModule, ENTRY, ...
// opcode ::= add, greater-than, ...
// attribute_name ::= condition, body, dimensions, ...
-// dim_labels_pattern ::= [0-9bf]{3,}_[0-9io]{3,}->[0-9bf]{3,}
TokKind HloLexer::LexIdentifier() {
{
auto consumable = RegexpStringPieceFromPointers(token_start_, buf_.end());
@@ -223,16 +220,6 @@ TokKind HloLexer::LexIdentifier() {
return TokKind::kOpcode;
}
- {
- auto consumable = RegexpStringPieceFromPointers(token_start_, buf_.end());
- static LazyRE2 dim_labels_pattern = {
- R"([0-9bf]{3,}_[0-9io]{3,}->[0-9bf]{3,})"};
- if (RE2::Consume(&consumable, *dim_labels_pattern)) {
- current_ptr_ = consumable.begin();
- str_val_.assign(token_start_, current_ptr_);
- return TokKind::kDimLabels;
- }
- }
current_ptr_ = token_start_ + 1;
return TokKind::kError;
}
@@ -253,20 +240,15 @@ TokKind HloLexer::LexPercent() {
return TokKind::kError;
}
-// Lex integer and floating-point values, -inf, and patterns for dim labels,
-// dxd (e.g. 1x2x3), and pad.
-//
-// fp with exp ::= [-]?([0-9]+|[0-9]+[.][0-9]*|[0-9]*[.][0-9]+)([eE][+-]?[0-9]+)
-// fp without exp ::= [-]?([0-9]+[.][0-9]*|[0-9]*[.][0-9]+)
-// dim_labels_pattern ::= [0-9bf]{3,}_[0-9io]{3,}->[0-9bf]{3,}
-// dxd_pattern ::= [0-9]+(x[0-9]+)+
-// pad_pattern ::= [0-9]+_[0-9]+(_[0-9]+)?(x[0-9]+_[0-9]+(_[0-9]+)?)*
-// int ::= [-]?[0-9]+
-// negative inf ::= '-inf'
-TokKind HloLexer::LexNumberOrPattern() {
+// Lex integer and floating-point values, and -inf.
+// int ::= [-]?[0-9]+
+// fp with exp ::= [-]?([0-9]+|[0-9]+[.][0-9]*|[0-9]*[.][0-9]+)([eE][+-]?[0-9]+)
+// fp without exp ::= [-]?([0-9]+[.][0-9]*|[0-9]*[.][0-9]+)
+// negative inf ::= '-inf'
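+// For illustration: "42" and "-7" lex as kInt; "2.5" and "1e3" lex as
+// kDecimal; "-inf" is handled as negative infinity.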
+TokKind HloLexer::LexDigitOrNegative() {
auto consumable = RegexpStringPieceFromPointers(token_start_, buf_.end());
static LazyRE2 float_pattern = {
- R"([-]?((\d+|\d+[.]\d*|\d*[.]\d+)([eE][+-]?\d+))|[-]?(\d+[.]\d*|\d*[.]\d+))"};
+ R"([-]?((\d+|\d+[.]\d*|\d*[.]\d+)([eE][+-]?\d+))|(\d+[.]\d*|\d*[.]\d+))"};
if (RE2::Consume(&consumable, *float_pattern)) {
current_ptr_ = consumable.begin();
tensorflow::strings::safe_strtod(string(token_start_, current_ptr_).c_str(),
@@ -274,30 +256,6 @@ TokKind HloLexer::LexNumberOrPattern() {
return TokKind::kDecimal;
}
- static LazyRE2 dim_labels_pattern = {
- R"([0-9bf]{3,}_[0-9io]{3,}->[0-9bf]{3,})"};
- static LazyRE2 dxd_pattern = {R"([0-9]+(x[0-9]+)+)"};
- static LazyRE2 pad_pattern = {
- R"([0-9]+_[0-9]+(_[0-9]+)?(x[0-9]+_[0-9]+(_[0-9]+)?)*)"};
-
- if (RE2::Consume(&consumable, *dim_labels_pattern)) {
- current_ptr_ = consumable.begin();
- str_val_.assign(token_start_, current_ptr_);
- return TokKind::kDimLabels;
- }
-
- if (RE2::Consume(&consumable, *dxd_pattern)) {
- current_ptr_ = consumable.begin();
- str_val_.assign(token_start_, current_ptr_);
- return TokKind::kDxD;
- }
-
- if (RE2::Consume(&consumable, *pad_pattern)) {
- current_ptr_ = consumable.begin();
- str_val_.assign(token_start_, current_ptr_);
- return TokKind::kPad;
- }
-
static LazyRE2 int_pattern = {R"([-]?\d+)"};
if (RE2::Consume(&consumable, *int_pattern)) {
current_ptr_ = consumable.begin();
@@ -392,12 +350,6 @@ string TokKindToString(TokKind kind) {
return "kName";
case TokKind::kAttributeName:
return "kAttributeName";
- case TokKind::kDimLabels:
- return "kDimLabels";
- case TokKind::kDxD:
- return "kDxD";
- case TokKind::kPad:
- return "kPad";
case TokKind::kShape:
return "kShape";
case TokKind::kOpcode:
diff --git a/tensorflow/compiler/xla/tools/parser/hlo_lexer.h b/tensorflow/compiler/xla/tools/parser/hlo_lexer.h
index 79c4f271a1..3b9efcb92d 100644
--- a/tensorflow/compiler/xla/tools/parser/hlo_lexer.h
+++ b/tensorflow/compiler/xla/tools/parser/hlo_lexer.h
@@ -37,15 +37,11 @@ class HloLexer {
}
TokKind Lex() { return current_kind_ = LexToken(); }
-
TokKind GetKind() const { return current_kind_; }
string GetStrVal() const {
switch (GetKind()) {
case TokKind::kName:
case TokKind::kAttributeName:
- case TokKind::kDimLabels:
- case TokKind::kDxD:
- case TokKind::kPad:
return str_val_;
default:
LOG(FATAL) << "This token does not have string value";
@@ -96,7 +92,7 @@ class HloLexer {
TokKind LexPercent();
TokKind LexShape();
TokKind LexConstant();
- TokKind LexNumberOrPattern();
+ TokKind LexDigitOrNegative();
TokKind LexComment();
const tensorflow::StringPiece buf_;
diff --git a/tensorflow/compiler/xla/tools/parser/hlo_parser.cc b/tensorflow/compiler/xla/tools/parser/hlo_parser.cc
index fed0492a54..6c2e37e3b5 100644
--- a/tensorflow/compiler/xla/tools/parser/hlo_parser.cc
+++ b/tensorflow/compiler/xla/tools/parser/hlo_parser.cc
@@ -28,9 +28,6 @@ namespace tools {
namespace {
using tensorflow::StringPiece;
-using tensorflow::gtl::optional;
-using tensorflow::str_util::Split;
-using tensorflow::str_util::SplitAndParseAsInts;
using tensorflow::strings::Printf;
using tensorflow::strings::StrAppend;
using tensorflow::strings::StrCat;
@@ -60,6 +57,7 @@ class HloParser {
bool ParseInstructionList(HloComputation::Builder* builder,
string* root_name);
bool ParseInstruction(HloComputation::Builder* builder, string* root_name);
+ bool ParseSharding(HloInstruction* instruction);
bool ParseControlPredecessors(HloInstruction* instruction);
bool ParseLiteral(std::unique_ptr<Literal>* literal, const Shape& shape);
bool ParseTupleLiteral(std::unique_ptr<Literal>* literal, const Shape& shape);
@@ -80,73 +78,10 @@ class HloParser {
bool ParseOperands(std::vector<HloInstruction*>* operands,
const int expected_size);
- // Describes the start, limit, and stride on every dimension of the operand
- // being sliced.
- struct SliceRanges {
- std::vector<int64> starts;
- std::vector<int64> limits;
- std::vector<int64> strides;
- };
-
- // Types of attributes.
- enum class AttrTy {
- kInt64,
- kFloat,
- kBracedInt64List,
- kHloComputation,
- kWindow,
- kConvolutionDimensionNumbers,
- kSharding,
- kInstructionList,
- kSliceRanges,
- kPaddingConfig,
- };
-
- struct AttrConfig {
- bool required; // whether it's required or optional
- AttrTy attr_type; // what type it is
- void* result; // where to store the parsed result.
- };
-
- // Parses attributes given names and configs of the attributes. Each parsed
- // result is passed back through the result pointer in corresponding
- // AttrConfig. Note that the result pointer must point to an optional<T> typed
- // variable which outlives this function. Returns false on error. You should
- // not use any of the results if this function failed.
- //
- // Example usage:
- //
- // std::unordered_map<string, AttrConfig> attrs;
- // optional<int64> foo;
- // attrs["foo"] = {/*required=*/false, AttrTy::kInt64, &foo};
- // optional<Window> bar;
- // attrs["bar"] = {/*required=*/true, AttrTy::kWindow, &bar};
- // if (!ParseAttribute(attrs)) {
- // return false; // Do not use 'foo' or 'bar' if failed.
- // }
- // // Do something with 'bar'.
- // if (foo) { // If attr foo is seen, do something with 'foo'. }
- //
- bool ParseAttributes(const std::unordered_map<string, AttrConfig>& attrs);
-
- // Parses a name and finds the corresponding hlo computation.
- bool ParseComputationName(HloComputation** value);
- // Parses a list of names and finds the corresponding hlo instructions.
- bool ParseInstructionNames(std::vector<HloInstruction*>* instructions);
- bool ParseWindow(Window* window);
- bool ParseConvolutionDimensionNumbers(ConvolutionDimensionNumbers* dnums);
- bool ParsePaddingConfig(PaddingConfig* padding);
- bool ParseSharding(OpSharding* sharding);
- bool ParseSingleSharding(OpSharding* sharding, bool lbrace_pre_lexed);
-
- // Parses a sub-attribute of the window attribute, e.g., size=1x2x3.
- bool ParseDxD(const string& name, std::vector<int64>* result);
- // Parses window's pad sub-attribute, e.g., pad=0_0x3x3.
- bool ParseWindowPad(std::vector<std::vector<int64>>* pad);
-
- bool ParseSliceRanges(SliceRanges* result);
- bool ParseInt64List(const TokKind start, const TokKind end,
- const TokKind delim, std::vector<int64>* result);
+ template <typename T>
+ bool ParseExtraAttribute(T* value, const string& expected_attribute);
+ template <typename T>
+ bool ParseAttributeValue(T* value);
bool ParseParamList();
bool ParseName(string* result);
@@ -279,7 +214,7 @@ bool HloParser::ParseInstructionList(HloComputation::Builder* builder,
"expects '}' at the end of instruction list.");
}
-// instruction ::= ('ROOT')? name '=' shape opcode operands (attribute)*
+// instruction ::= ('ROOT')? name '=' shape opcode operands (extra_attribute)*
bool HloParser::ParseInstruction(HloComputation::Builder* builder,
string* root_name) {
string name;
@@ -295,15 +230,6 @@ bool HloParser::ParseInstruction(HloComputation::Builder* builder,
if (is_root) {
*root_name = name;
}
-
- // Add optional attributes.
- std::unordered_map<string, AttrConfig> attrs;
- optional<OpSharding> sharding;
- attrs["sharding"] = {/*required=*/false, AttrTy::kSharding, &sharding};
- optional<std::vector<HloInstruction*>> predecessors;
- attrs["control-predecessors"] = {/*required=*/false, AttrTy::kInstructionList,
- &predecessors};
-
HloInstruction* instruction;
switch (opcode) {
case HloOpcode::kParameter: {
@@ -311,8 +237,7 @@ bool HloParser::ParseInstruction(HloComputation::Builder* builder,
if (!ParseToken(TokKind::kLparen,
"expects '(' before parameter number") ||
!ParseInt64(&parameter_number) ||
- !ParseToken(TokKind::kRparen, "expects ')' after parameter number") ||
- !ParseAttributes(attrs)) {
+ !ParseToken(TokKind::kRparen, "expects ')' after parameter number")) {
return false;
}
instruction = builder->AddInstruction(
@@ -324,8 +249,7 @@ bool HloParser::ParseInstruction(HloComputation::Builder* builder,
if (!ParseToken(TokKind::kLparen,
"expects '(' before constant literal") ||
!ParseLiteral(&literal, shape) ||
- !ParseToken(TokKind::kRparen, "expects ')' after constant literal") ||
- !ParseAttributes(attrs)) {
+ !ParseToken(TokKind::kRparen, "expects ')' after constant literal")) {
return false;
}
instruction = builder->AddInstruction(
@@ -351,8 +275,7 @@ bool HloParser::ParseInstruction(HloComputation::Builder* builder,
case HloOpcode::kSin:
case HloOpcode::kSort:
case HloOpcode::kTanh: {
- if (!ParseOperands(&operands, /*expected_size=*/1) ||
- !ParseAttributes(attrs)) {
+ if (!ParseOperands(&operands, /*expected_size=*/1)) {
return false;
}
instruction = builder->AddInstruction(
@@ -382,8 +305,7 @@ bool HloParser::ParseInstruction(HloComputation::Builder* builder,
case HloOpcode::kShiftLeft:
case HloOpcode::kShiftRightArithmetic:
case HloOpcode::kShiftRightLogical: {
- if (!ParseOperands(&operands, /*expected_size=*/2) ||
- !ParseAttributes(attrs)) {
+ if (!ParseOperands(&operands, /*expected_size=*/2)) {
return false;
}
instruction = builder->AddInstruction(HloInstruction::CreateBinary(
@@ -393,8 +315,7 @@ bool HloParser::ParseInstruction(HloComputation::Builder* builder,
// Ternary ops.
case HloOpcode::kClamp:
case HloOpcode::kSelect: {
- if (!ParseOperands(&operands, /*expected_size=*/3) ||
- !ParseAttributes(attrs)) {
+ if (!ParseOperands(&operands, /*expected_size=*/3)) {
return false;
}
instruction = builder->AddInstruction(HloInstruction::CreateTernary(
@@ -403,8 +324,7 @@ bool HloParser::ParseInstruction(HloComputation::Builder* builder,
}
// Other supported ops.
case HloOpcode::kConvert: {
- if (!ParseOperands(&operands, /*expected_size=*/1) ||
- !ParseAttributes(attrs)) {
+ if (!ParseOperands(&operands, /*expected_size=*/1)) {
return false;
}
instruction = builder->AddInstruction(
@@ -412,8 +332,7 @@ bool HloParser::ParseInstruction(HloComputation::Builder* builder,
break;
}
case HloOpcode::kCrossReplicaSum: {
- if (!ParseOperands(&operands, /*expected_size=*/1) ||
- !ParseAttributes(attrs)) {
+ if (!ParseOperands(&operands, /*expected_size=*/1)) {
return false;
}
instruction = builder->AddInstruction(
@@ -421,8 +340,7 @@ bool HloParser::ParseInstruction(HloComputation::Builder* builder,
break;
}
case HloOpcode::kReshape: {
- if (!ParseOperands(&operands, /*expected_size=*/1) ||
- !ParseAttributes(attrs)) {
+ if (!ParseOperands(&operands, /*expected_size=*/1)) {
return false;
}
instruction = builder->AddInstruction(
@@ -430,7 +348,7 @@ bool HloParser::ParseInstruction(HloComputation::Builder* builder,
break;
}
case HloOpcode::kTuple: {
- if (!ParseOperands(&operands) || !ParseAttributes(attrs)) {
+ if (!ParseOperands(&operands)) {
return false;
}
instruction =
@@ -438,376 +356,126 @@ bool HloParser::ParseInstruction(HloComputation::Builder* builder,
break;
}
case HloOpcode::kWhile: {
- optional<HloComputation*> condition;
- optional<HloComputation*> body;
- attrs["condition"] = {/*required=*/true, AttrTy::kHloComputation,
- &condition};
- attrs["body"] = {/*required=*/true, AttrTy::kHloComputation, &body};
+ HloComputation* condition;
+ HloComputation* body;
if (!ParseOperands(&operands, /*expected_size=*/1) ||
- !ParseAttributes(attrs)) {
+ !ParseExtraAttribute(&condition,
+ /*expected_attribute=*/"condition") ||
+ !ParseExtraAttribute(&body, /*expected_attribute=*/"body")) {
return false;
}
instruction = builder->AddInstruction(HloInstruction::CreateWhile(
- shape, *condition, *body, /*init=*/operands[0]));
+ shape, condition, body, /*init=*/operands[0]));
break;
}
case HloOpcode::kRecv: {
- optional<int64> channel_id;
- attrs["channel_id"] = {/*required=*/true, AttrTy::kInt64, &channel_id};
+ int64 channel_id;
if (!ParseOperands(&operands, /*expected_size=*/0) ||
- !ParseAttributes(attrs)) {
+ !ParseExtraAttribute(&channel_id,
+ /*expected_attribute=*/"channel_id")) {
return false;
}
instruction = builder->AddInstruction(
- HloInstruction::CreateRecv(shape.tuple_shapes(0), *channel_id));
- break;
- }
- case HloOpcode::kRecvDone: {
- optional<int64> channel_id;
- attrs["channel_id"] = {/*required=*/true, AttrTy::kInt64, &channel_id};
- if (!ParseOperands(&operands, /*expected_size=*/1) ||
- !ParseAttributes(attrs)) {
- return false;
- }
- if (channel_id != operands[0]->channel_id()) {
- return false;
- }
- instruction =
- builder->AddInstruction(HloInstruction::CreateRecvDone(operands[0]));
+ HloInstruction::CreateRecv(shape, channel_id));
break;
}
case HloOpcode::kSend: {
- optional<int64> channel_id;
- attrs["channel_id"] = {/*required=*/true, AttrTy::kInt64, &channel_id};
+ int64 channel_id;
if (!ParseOperands(&operands, /*expected_size=*/1) ||
- !ParseAttributes(attrs)) {
+ !ParseExtraAttribute(&channel_id,
+ /*expected_attribute=*/"channel_id")) {
return false;
}
instruction = builder->AddInstruction(
- HloInstruction::CreateSend(operands[0], *channel_id));
- break;
- }
- case HloOpcode::kSendDone: {
- optional<int64> channel_id;
- attrs["channel_id"] = {/*required=*/true, AttrTy::kInt64, &channel_id};
- if (!ParseOperands(&operands, /*expected_size=*/1) ||
- !ParseAttributes(attrs)) {
- return false;
- }
- if (channel_id != operands[0]->channel_id()) {
- return false;
- }
- instruction =
- builder->AddInstruction(HloInstruction::CreateSendDone(operands[0]));
+ HloInstruction::CreateSend(operands[0], channel_id));
break;
}
case HloOpcode::kGetTupleElement: {
- optional<int64> index;
- attrs["index"] = {/*required=*/true, AttrTy::kInt64, &index};
+ int64 index;
if (!ParseOperands(&operands, /*expected_size=*/1) ||
- !ParseAttributes(attrs)) {
+ !ParseExtraAttribute(&index, /*expected_attribute=*/"index")) {
return false;
}
instruction = builder->AddInstruction(
- HloInstruction::CreateGetTupleElement(shape, operands[0], *index));
+ HloInstruction::CreateGetTupleElement(shape, operands[0], index));
break;
}
case HloOpcode::kCall: {
- optional<HloComputation*> to_apply;
- attrs["to_apply"] = {/*required=*/true, AttrTy::kHloComputation,
- &to_apply};
- if (!ParseOperands(&operands) || !ParseAttributes(attrs)) {
- return false;
- }
- instruction = builder->AddInstruction(
- HloInstruction::CreateCall(shape, operands, *to_apply));
- break;
- }
- case HloOpcode::kReduceWindow: {
- optional<HloComputation*> reduce_computation;
- optional<Window> window;
- attrs["window"] = {/*required=*/true, AttrTy::kWindow, &window};
- attrs["to_apply"] = {/*required=*/true, AttrTy::kHloComputation,
- &reduce_computation};
- if (!ParseOperands(&operands, /*expected_size=*/2) ||
- !ParseAttributes(attrs)) {
- return false;
- }
- instruction = builder->AddInstruction(HloInstruction::CreateReduceWindow(
- shape, /*operand=*/operands[0], /*init_value=*/operands[1], *window,
- *reduce_computation));
- break;
- }
- case HloOpcode::kConvolution: {
- optional<Window> window;
- optional<ConvolutionDimensionNumbers> dnums;
- attrs["window"] = {/*required=*/true, AttrTy::kWindow, &window};
- attrs["dim_labels"] = {/*required=*/true,
- AttrTy::kConvolutionDimensionNumbers, &dnums};
- if (!ParseOperands(&operands, /*expected_size=*/2) ||
- !ParseAttributes(attrs)) {
- return false;
- }
- instruction = builder->AddInstruction(HloInstruction::CreateConvolve(
- shape, /*lhs=*/operands[0], /*rhs=*/operands[1], *window, *dnums));
- break;
- }
- case HloOpcode::kBroadcast: {
- optional<std::vector<int64>> broadcast_dimensions;
- attrs["dimensions"] = {/*required=*/true, AttrTy::kBracedInt64List,
- &broadcast_dimensions};
- if (!ParseOperands(&operands, /*expected_size=*/1) ||
- !ParseAttributes(attrs)) {
- return false;
- }
- instruction = builder->AddInstruction(HloInstruction::CreateBroadcast(
- shape, operands[0], *broadcast_dimensions));
- break;
- }
- case HloOpcode::kConcatenate: {
- optional<std::vector<int64>> dimensions;
- attrs["dimensions"] = {/*required=*/true, AttrTy::kBracedInt64List,
- &dimensions};
- if (!ParseOperands(&operands) || !ParseAttributes(attrs) ||
- dimensions->size() != 1) {
- return false;
- }
- instruction = builder->AddInstruction(HloInstruction::CreateConcatenate(
- shape, operands, dimensions->at(0)));
- break;
- }
- case HloOpcode::kMap: {
- optional<HloComputation*> to_apply;
- attrs["to_apply"] = {/*required=*/true, AttrTy::kHloComputation,
- &to_apply};
- if (!ParseOperands(&operands) || !ParseAttributes(attrs)) {
- return false;
- }
- instruction = builder->AddInstruction(
- HloInstruction::CreateMap(shape, operands, *to_apply));
- break;
- }
- case HloOpcode::kReduce: {
- optional<HloComputation*> reduce_computation;
- attrs["to_apply"] = {/*required=*/true, AttrTy::kHloComputation,
- &reduce_computation};
- optional<std::vector<int64>> dimensions_to_reduce;
- attrs["dimensions"] = {/*required=*/true, AttrTy::kBracedInt64List,
- &dimensions_to_reduce};
- if (!ParseOperands(&operands, /*expected_size=*/2) ||
- !ParseAttributes(attrs)) {
- return false;
- }
- instruction = builder->AddInstruction(HloInstruction::CreateReduce(
- shape, /*operand=*/operands[0], /*init_value=*/operands[1],
- *dimensions_to_reduce, *reduce_computation));
- break;
- }
- case HloOpcode::kReverse: {
- optional<std::vector<int64>> dimensions;
- attrs["dimensions"] = {/*required=*/true, AttrTy::kBracedInt64List,
- &dimensions};
- if (!ParseOperands(&operands, /*expected_size=*/1) ||
- !ParseAttributes(attrs)) {
- return false;
- }
- instruction = builder->AddInstruction(
- HloInstruction::CreateReverse(shape, operands[0], *dimensions));
- break;
- }
- case HloOpcode::kSelectAndScatter: {
- optional<HloComputation*> select;
- attrs["select"] = {/*required=*/true, AttrTy::kHloComputation, &select};
- optional<HloComputation*> scatter;
- attrs["scatter"] = {/*required=*/true, AttrTy::kHloComputation, &scatter};
- optional<Window> window;
- attrs["window"] = {/*required=*/true, AttrTy::kWindow, &window};
- if (!ParseOperands(&operands, /*expected_size=*/3) ||
- !ParseAttributes(attrs)) {
- return false;
- }
- instruction =
- builder->AddInstruction(HloInstruction::CreateSelectAndScatter(
- shape, /*operand=*/operands[0], *select, *window,
- /*source=*/operands[1], /*init_value=*/operands[2], *scatter));
- break;
- }
- case HloOpcode::kSlice: {
- optional<SliceRanges> slice_ranges;
- attrs["slice"] = {/*required=*/true, AttrTy::kSliceRanges, &slice_ranges};
- if (!ParseOperands(&operands, /*expected_size=*/1) ||
- !ParseAttributes(attrs)) {
- return false;
- }
- instruction = builder->AddInstruction(HloInstruction::CreateSlice(
- shape, operands[0], slice_ranges->starts, slice_ranges->limits,
- slice_ranges->strides));
- break;
- }
- case HloOpcode::kDynamicSlice: {
- optional<std::vector<int64>> dynamic_slice_sizes;
- attrs["dynamic_slice_sizes"] = {
- /*required=*/true, AttrTy::kBracedInt64List, &dynamic_slice_sizes};
- if (!ParseOperands(&operands, /*expected_size=*/2) ||
- !ParseAttributes(attrs)) {
- return false;
- }
- instruction = builder->AddInstruction(HloInstruction::CreateDynamicSlice(
- shape, /*operand=*/operands[0], /*start_indices=*/operands[1],
- *dynamic_slice_sizes));
- break;
- }
- case HloOpcode::kDynamicUpdateSlice: {
- if (!ParseOperands(&operands, /*expected_size=*/3) ||
- !ParseAttributes(attrs)) {
- return false;
- }
- instruction =
- builder->AddInstruction(HloInstruction::CreateDynamicUpdateSlice(
- shape, /*operand=*/operands[0], /*update=*/operands[1],
- /*start_indices=*/operands[2]));
- break;
- }
- case HloOpcode::kTranspose: {
- optional<std::vector<int64>> dimensions;
- attrs["dimensions"] = {/*required=*/true, AttrTy::kBracedInt64List,
- &dimensions};
- if (!ParseOperands(&operands, /*expected_size=*/1) ||
- !ParseAttributes(attrs)) {
+ HloComputation* to_apply;
+ if (!ParseOperands(&operands) ||
+ !ParseExtraAttribute(&to_apply,
+ /*expected_attribute=*/"to_apply")) {
return false;
}
instruction = builder->AddInstruction(
- HloInstruction::CreateTranspose(shape, operands[0], *dimensions));
- break;
- }
- case HloOpcode::kBatchNormTraining: {
- optional<float> epsilon;
- attrs["epsilon"] = {/*required=*/true, AttrTy::kFloat, &epsilon};
- optional<int64> feature_index;
- attrs["feature_index"] = {/*required=*/true, AttrTy::kInt64,
- &feature_index};
- if (!ParseOperands(&operands, /*expected_size=*/3) ||
- !ParseAttributes(attrs)) {
- return false;
- }
- instruction =
- builder->AddInstruction(HloInstruction::CreateBatchNormTraining(
- shape, /*operand=*/operands[0], /*scale=*/operands[1],
- /*offset=*/operands[2], *epsilon, *feature_index));
- break;
- }
- case HloOpcode::kBatchNormInference: {
- optional<float> epsilon;
- attrs["epsilon"] = {/*required=*/true, AttrTy::kFloat, &epsilon};
- optional<int64> feature_index;
- attrs["feature_index"] = {/*required=*/true, AttrTy::kInt64,
- &feature_index};
- if (!ParseOperands(&operands, /*expected_size=*/5) ||
- !ParseAttributes(attrs)) {
- return false;
- }
- instruction =
- builder->AddInstruction(HloInstruction::CreateBatchNormInference(
- shape, /*operand=*/operands[0], /*scale=*/operands[1],
- /*offset=*/operands[2], /*mean=*/operands[3],
- /*variance=*/operands[4], *epsilon, *feature_index));
- break;
- }
- case HloOpcode::kBatchNormGrad: {
- optional<float> epsilon;
- attrs["epsilon"] = {/*required=*/true, AttrTy::kFloat, &epsilon};
- optional<int64> feature_index;
- attrs["feature_index"] = {/*required=*/true, AttrTy::kInt64,
- &feature_index};
- if (!ParseOperands(&operands, /*expected_size=*/5) ||
- !ParseAttributes(attrs)) {
- return false;
- }
- instruction = builder->AddInstruction(HloInstruction::CreateBatchNormGrad(
- shape, /*operand=*/operands[0], /*scale=*/operands[1],
- /*mean=*/operands[2], /*variance=*/operands[3],
- /*grad_output=*/operands[4], *epsilon, *feature_index));
- break;
- }
- case HloOpcode::kPad: {
- optional<PaddingConfig> padding;
- attrs["padding"] = {/*required=*/true, AttrTy::kPaddingConfig, &padding};
- if (!ParseOperands(&operands, /*expected_size=*/2) ||
- !ParseAttributes(attrs)) {
- return false;
- }
- instruction = builder->AddInstruction(HloInstruction::CreatePad(
- shape, operands[0], /*padding_value=*/operands[1], *padding));
+ HloInstruction::CreateCall(shape, operands, to_apply));
break;
}
+ case HloOpcode::kBroadcast:
case HloOpcode::kCustomCall:
+ case HloOpcode::kConcatenate:
case HloOpcode::kReducePrecision:
+ case HloOpcode::kConvolution:
+ case HloOpcode::kMap:
+ case HloOpcode::kPad:
+ case HloOpcode::kReduce:
+ case HloOpcode::kReduceWindow:
+ case HloOpcode::kSelectAndScatter:
+ case HloOpcode::kReverse:
case HloOpcode::kRng:
+ case HloOpcode::kSlice:
+ case HloOpcode::kDynamicSlice:
+ case HloOpcode::kDynamicUpdateSlice:
+ case HloOpcode::kTranspose:
case HloOpcode::kFusion:
+ case HloOpcode::kBatchNormTraining:
+ case HloOpcode::kBatchNormInference:
case HloOpcode::kInfeed:
case HloOpcode::kOutfeed:
+ case HloOpcode::kBatchNormGrad:
case HloOpcode::kTrace:
return TokenError(StrCat("parsing not yet implemented for op: ",
HloOpcodeString(opcode)));
}
- // Add common attrs (sharding, control predecessors) to the instruction, if
- // they were seen.
- if (sharding) {
- instruction->set_sharding(
- HloSharding::FromProto(sharding.value()).ValueOrDie());
- }
- if (predecessors) {
- for (auto* pre : *predecessors) {
- Status status = pre->AddControlDependencyTo(instruction);
- if (!status.ok()) {
- return TokenError(StrCat("error adding control dependency for: ", name,
- " status: ", status.ToString()));
- }
+ bool has_sharding = false;
+ bool has_control = false;
+ while (EatIfPresent(TokKind::kComma)) {
+ string attribute_name;
+ if (!ParseAttributeName(&attribute_name)) {
+ return TokenError("expects ', sharding=' or ', control-predecessors='");
}
- }
- return AddInstruction(name, instruction);
-}
-
-// ::= '{' (single_sharding | tuple_sharding) '}'
-//
-// tuple_sharding ::= single_sharding* (',' single_sharding)*
-bool HloParser::ParseSharding(OpSharding* sharding) {
- // A single sharding starts with '{' and is not followed by '{'.
- // A tuple sharding starts with '{' and is followed by '{', or is '{''}' for
- // an empty tuple.
- if (!ParseToken(TokKind::kLbrace,
- "expected '{' to start sharding attribute")) {
- return false;
- }
- if (lexer_.GetKind() != TokKind::kLbrace &&
- lexer_.GetKind() != TokKind::kRbrace) {
- return ParseSingleSharding(sharding, /*lbrace_pre_lexed=*/true);
- }
-
- // Tuple sharding.
- // Allow empty tuple shardings.
- if (lexer_.GetKind() != TokKind::kRbrace) {
- do {
- if (!ParseSingleSharding(sharding->add_tuple_shardings(),
- /*lbrace_pre_lexed=*/false)) {
+ if (attribute_name == "sharding") {
+ // Parse "sharding=".
+ if (has_sharding) {
+ return TokenError("expects at most 1 'sharding='");
+ }
+ has_sharding = true;
+ if (!ParseSharding(instruction)) {
return false;
}
- } while (EatIfPresent(TokKind::kComma));
+ } else if (attribute_name == "control-predecessors") {
+ // Parse "control-predecessors"
+ if (has_control) {
+ return TokenError("expects at most 1 'control-predecessors='");
+ }
+ has_control = true;
+ if (!ParseControlPredecessors(instruction)) {
+ return false;
+ }
+ } else {
+ return TokenError(StrCat("unexpected attribute: ", attribute_name));
+ }
}
- sharding->set_type(OpSharding::Type::OpSharding_Type_TUPLE);
- return ParseToken(TokKind::kRbrace, "expected '}' to end sharding attribute");
+ return AddInstruction(name, instruction);
}
-// ::= '{' 'replicated'? 'maximal'? ('device=' int)? shape?
-// ('devices=' ('[' dims ']')* device_list)? '}'
-// dims ::= int_list device_list ::= int_list
-bool HloParser::ParseSingleSharding(OpSharding* sharding,
- bool lbrace_pre_lexed) {
- if (!lbrace_pre_lexed &&
- !ParseToken(TokKind::kLbrace,
+// ::= '{' 'replicated'? 'maximal'? ('device=' int)? shape?
+//     ('devices=' ('[' dims ']')* device_list)? '}'
+// dims ::= int_list
+// device_list ::= int_list
+bool HloParser::ParseSharding(HloInstruction* instruction) {
+ if (!ParseToken(TokKind::kLbrace,
"expected '{' to start sharding attribute")) {
return false;
}
@@ -877,6 +545,7 @@ bool HloParser::ParseSingleSharding(OpSharding* sharding,
}
}
+ OpSharding sharding;
if (replicated) {
if (!devices.empty()) {
return TokenError(
@@ -886,7 +555,7 @@ bool HloParser::ParseSingleSharding(OpSharding* sharding,
return TokenError(
"replicated shardings should not have any tile shape set");
}
- sharding->set_type(OpSharding::Type::OpSharding_Type_REPLICATED);
+ sharding.set_type(OpSharding::Type::OpSharding_Type_REPLICATED);
} else if (maximal) {
if (devices.size() != 1) {
return TokenError(
@@ -895,8 +564,8 @@ bool HloParser::ParseSingleSharding(OpSharding* sharding,
if (!ShapeUtil::Equal(tile_shape, Shape())) {
return TokenError("maximal shardings should not have any tile shape set");
}
- sharding->set_type(OpSharding::Type::OpSharding_Type_MAXIMAL);
- sharding->add_tile_assignment_devices(devices[0]);
+ sharding.set_type(OpSharding::Type::OpSharding_Type_MAXIMAL);
+ sharding.add_tile_assignment_devices(devices[0]);
} else {
if (devices.size() <= 1) {
return TokenError(
@@ -910,43 +579,47 @@ bool HloParser::ParseSingleSharding(OpSharding* sharding,
"non-maximal shardings must have a tile assignment list including "
"dimensions");
}
- sharding->set_type(OpSharding::Type::OpSharding_Type_OTHER);
- *sharding->mutable_tile_shape() = tile_shape;
+ sharding.set_type(OpSharding::Type::OpSharding_Type_OTHER);
+ *sharding.mutable_tile_shape() = tile_shape;
for (int64 dim : tile_assignment_dimensions) {
- sharding->add_tile_assignment_dimensions(dim);
+ sharding.add_tile_assignment_dimensions(dim);
}
for (int64 device : devices) {
- sharding->add_tile_assignment_devices(device);
+ sharding.add_tile_assignment_devices(device);
}
}
+ instruction->set_sharding(HloSharding::FromProto(sharding).ValueOrDie());
lexer_.Lex();
return true;
}
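// Illustrative inputs accepted by the grammar above:
//   sharding={replicated}
//   sharding={maximal device=3}
//   sharding={f32[4,4] devices=[2,2]0,1,2,3}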
// '{' name+ '}'
-bool HloParser::ParseInstructionNames(
- std::vector<HloInstruction*>* instructions) {
+bool HloParser::ParseControlPredecessors(HloInstruction* instruction) {
if (!ParseToken(TokKind::kLbrace,
- "expects '{' at the beginning of instruction name list")) {
+ "expects '{' at the beginning of control predecessors")) {
return false;
}
do {
string name;
if (!ParseName(&name)) {
- return TokenError("expects a instruction name");
+ return TokenError("expects a control predecessor");
}
- HloInstruction* instr =
+ HloInstruction* pre =
tensorflow::gtl::FindPtrOrNull(instruction_pool_, name);
- if (!instr) {
+ if (!pre) {
return TokenError(
- Printf("instruction '%s' is not defined", name.c_str()));
+ StrCat("control predecessor ", name, " is not defined: "));
+ }
+ Status status = pre->AddControlDependencyTo(instruction);
+ if (!status.ok()) {
+ return TokenError(StrCat("error adding control dependency for: ", name,
+ " status: ", status.ToString()));
}
- instructions->push_back(instr);
} while (EatIfPresent(TokKind::kComma));
return ParseToken(TokKind::kRbrace,
- "expects '}' at the end of control instructions");
+ "expects '}' at the end of control predecessors");
}
bool HloParser::SetValueInLiteral(int64 value, int64 linear_index,
@@ -1284,134 +957,28 @@ bool HloParser::ParseOperands(std::vector<HloInstruction*>* operands,
return true;
}
-bool HloParser::ParseAttributes(
- const std::unordered_map<string, AttrConfig>& attrs) {
- std::unordered_set<string> seen_attrs;
- while (EatIfPresent(TokKind::kComma)) {
- string name;
- if (!ParseAttributeName(&name)) {
- return TokenError("error parsing attributes");
- }
- VLOG(1) << "Parsing attribute " << name;
- if (!seen_attrs.insert(name).second) {
- return TokenError(Printf("attribute %s already exists", name.c_str()));
- }
- auto attr_it = attrs.find(name);
- if (attr_it == attrs.end()) {
- return TokenError(Printf("unexpected attribute %s", name.c_str()));
- }
- AttrTy attr_type = attr_it->second.attr_type;
- void* attr_out_ptr = attr_it->second.result;
- bool success = [&] {
- switch (attr_type) {
- case AttrTy::kInt64: {
- int64 result;
- if (!ParseInt64(&result)) {
- return false;
- }
- static_cast<optional<int64>*>(attr_out_ptr)->emplace(result);
- return true;
- }
- case AttrTy::kFloat: {
- double result;
- if (!ParseDouble(&result)) {
- return false;
- }
- if (result > std::numeric_limits<float>::max() ||
- result < std::numeric_limits<float>::lowest()) {
- return TokenError("value out of range for float");
- }
- static_cast<optional<float>*>(attr_out_ptr)
- ->emplace(static_cast<float>(result));
- return true;
- }
- case AttrTy::kHloComputation: {
- HloComputation* result;
- if (!ParseComputationName(&result)) {
- return false;
- }
- static_cast<optional<HloComputation*>*>(attr_out_ptr)
- ->emplace(result);
- return true;
- }
- case AttrTy::kWindow: {
- Window result;
- if (!ParseWindow(&result)) {
- return false;
- }
- static_cast<optional<Window>*>(attr_out_ptr)->emplace(result);
- return true;
- }
- case AttrTy::kConvolutionDimensionNumbers: {
- ConvolutionDimensionNumbers result;
- if (!ParseConvolutionDimensionNumbers(&result)) {
- return false;
- }
- static_cast<optional<ConvolutionDimensionNumbers>*>(attr_out_ptr)
- ->emplace(result);
- return true;
- }
- case AttrTy::kSharding: {
- OpSharding sharding;
- if (!ParseSharding(&sharding)) {
- return false;
- }
- static_cast<optional<OpSharding>*>(attr_out_ptr)->emplace(sharding);
- return true;
- }
- case AttrTy::kInstructionList: {
- std::vector<HloInstruction*> result;
- if (!ParseInstructionNames(&result)) {
- return false;
- }
- static_cast<optional<std::vector<HloInstruction*>>*>(attr_out_ptr)
- ->emplace(result);
- return true;
- }
- case AttrTy::kBracedInt64List: {
- std::vector<int64> result;
- if (!ParseInt64List(TokKind::kLbrace, TokKind::kRbrace,
- TokKind::kComma, &result)) {
- return false;
- }
- static_cast<optional<std::vector<int64>>*>(attr_out_ptr)
- ->emplace(result);
- return true;
- }
- case AttrTy::kSliceRanges: {
- SliceRanges result;
- if (!ParseSliceRanges(&result)) {
- return false;
- }
- static_cast<optional<SliceRanges>*>(attr_out_ptr)->emplace(result);
- return true;
- }
- case AttrTy::kPaddingConfig: {
- PaddingConfig result;
- if (!ParsePaddingConfig(&result)) {
- return false;
- }
- static_cast<optional<PaddingConfig>*>(attr_out_ptr)->emplace(result);
- return true;
- }
- }
- }();
- if (!success) {
- return TokenError(Printf("error parsing attribute %s", name.c_str()));
- }
+// extra_attribute ::= ',' attribute_name value
+template <typename T>
+bool HloParser::ParseExtraAttribute(T* value,
+ const string& expected_attribute) {
+ if (!ParseToken(TokKind::kComma,
+ "expects ',' in front of an extra attribute")) {
+ return false;
}
- // Check that all required attrs were seen.
- for (const auto& attr_it : attrs) {
- if (attr_it.second.required &&
- seen_attrs.find(attr_it.first) == seen_attrs.end()) {
- return TokenError(Printf("attribute %s is expected but not seen",
- attr_it.first.c_str()));
- }
+ string attribute_name;
+ if (!ParseAttributeName(&attribute_name) ||
+ attribute_name != expected_attribute) {
+ return TokenError(StrCat("expects attribute name: ", expected_attribute));
+ }
+ if (!ParseAttributeValue(value)) {
+ return TokenError(
+ StrCat("expects value for attribute: ", expected_attribute));
}
return true;
}
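A hypothetical call site inside HloParser, relying on the int64 specialization defined further below, to consume an extra attribute such as ", channel_id=15":

  int64 channel_id;
  if (!ParseExtraAttribute(&channel_id, /*expected_attribute=*/"channel_id")) {
    return false;  // ParseExtraAttribute has already emitted a TokenError
  }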
-bool HloParser::ParseComputationName(HloComputation** value) {
+template <>
+bool HloParser::ParseAttributeValue<HloComputation*>(HloComputation** value) {
string name;
if (!ParseName(&name)) {
return TokenError("expects computation name");
@@ -1423,269 +990,9 @@ bool HloParser::ParseComputationName(HloComputation** value) {
return true;
}
-// ::= '{' size stride? pad? lhs_dilate? rhs_dilate? '}'
-// The subattributes can appear in any order. 'size=' is required, others are
-// optional.
-bool HloParser::ParseWindow(Window* window) {
- if (!ParseToken(TokKind::kLbrace, "expected '{' to start window attribute")) {
- return false;
- }
-
- std::vector<int64> size;
- std::vector<int64> stride;
- std::vector<std::vector<int64>> pad;
- std::vector<int64> lhs_dilate;
- std::vector<int64> rhs_dilate;
- while (lexer_.GetKind() != TokKind::kRbrace) {
- string field_name;
- if (!ParseAttributeName(&field_name)) {
- return TokenError("expects sub-attributes in window");
- }
- bool ok = [&] {
- if (field_name == "size") {
- return ParseDxD("size", &size);
- }
- if (field_name == "stride") {
- return ParseDxD("stride", &stride);
- }
- if (field_name == "lhs_dilate") {
- return ParseDxD("lhs_dilate", &lhs_dilate);
- }
- if (field_name == "rhs_dilate") {
- return ParseDxD("rls_dilate", &rhs_dilate);
- }
- if (field_name == "pad") {
- return ParseWindowPad(&pad);
- }
- return TokenError(StrCat("unexpected attribute name: ", field_name));
- }();
- if (!ok) {
- return false;
- }
- }
-
- if (size.empty()) {
- return TokenError(
- "sub-attribute 'size=' is required in the window attribute");
- }
- if (!stride.empty() && stride.size() != size.size()) {
- return TokenError("expects 'stride=' has the same size as 'size='");
- }
- if (!lhs_dilate.empty() && lhs_dilate.size() != size.size()) {
- return TokenError("expects 'lhs_dilate=' has the same size as 'size='");
- }
- if (!rhs_dilate.empty() && rhs_dilate.size() != size.size()) {
- return TokenError("expects 'rhs_dilate=' has the same size as 'size='");
- }
- if (!pad.empty() && pad.size() != size.size()) {
- return TokenError("expects 'pad=' has the same size as 'size='");
- }
-
- for (int i = 0; i < size.size(); i++) {
- window->add_dimensions()->set_size(size[i]);
- if (!pad.empty()) {
- window->mutable_dimensions(i)->set_padding_low(pad[i][0]);
- window->mutable_dimensions(i)->set_padding_high(pad[i][1]);
- }
- // If some field is not present, it has the default value.
- window->mutable_dimensions(i)->set_stride(stride.empty() ? 1 : stride[i]);
- window->mutable_dimensions(i)->set_base_dilation(
- lhs_dilate.empty() ? 1 : lhs_dilate[i]);
- window->mutable_dimensions(i)->set_window_dilation(
- rhs_dilate.empty() ? 1 : rhs_dilate[i]);
- }
- return ParseToken(TokKind::kRbrace, "expected '}' to end window attribute");
-}
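For reference, a string such as window={size=2x3 stride=1x2 pad=0_1x2_2} (values illustrative) decodes under the rules above to the following Window proto, written in setter form:

  Window window;
  // dimension 0: size=2, stride=1, pad=0_1; dilations default to 1
  auto* dim0 = window.add_dimensions();
  dim0->set_size(2);
  dim0->set_padding_low(0);
  dim0->set_padding_high(1);
  dim0->set_stride(1);
  dim0->set_base_dilation(1);    // lhs_dilate omitted
  dim0->set_window_dilation(1);  // rhs_dilate omitted
  // dimension 1: size=3, stride=2, pad=2_2
  auto* dim1 = window.add_dimensions();
  dim1->set_size(3);
  dim1->set_padding_low(2);
  dim1->set_padding_high(2);
  dim1->set_stride(2);
  dim1->set_base_dilation(1);
  dim1->set_window_dilation(1);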
-
-// This is the inverse of HloInstruction::ConvolutionDimensionNumbersToString.
-// The string looks like "dim_labels=0bf_0io->0bf".
-bool HloParser::ParseConvolutionDimensionNumbers(
- ConvolutionDimensionNumbers* dnums) {
- if (lexer_.GetKind() != TokKind::kDimLabels) {
- return TokenError("expects dim labels pattern, e.g., 'bf0_0io->0bf'");
- }
- string str = lexer_.GetStrVal();
-
- // The str is expected to have 3 items, lhs, rhs, out, and it must look like
- // lhs_rhs->out, that is, the first separator is "_" and the second is "->".
- // So we replace the "->" with "_" and then split on "_".
- str = tensorflow::str_util::StringReplace(str, /*oldsub=*/"->",
- /*newsub=*/"_",
- /*replace_all=*/false);
- std::vector<string> lhs_rhs_out = Split(str, "_");
- if (lhs_rhs_out.size() != 3) {
- LOG(FATAL) << "expects 3 items: lhs, rhs, and output dims, but sees "
- << str;
- }
-
- const int64 rank = lhs_rhs_out[0].length();
- if (rank != lhs_rhs_out[1].length() || rank != lhs_rhs_out[2].length()) {
- return TokenError(
- "convolution lhs, rhs, and output must have the same rank");
- }
- if (rank < 3) {
- return TokenError("convolution rank must >=3");
- }
-
- auto is_unique = [](string str) -> bool {
- std::sort(str.begin(), str.end());
- return std::unique(str.begin(), str.end()) == str.end();
- };
-
- // lhs
- {
- const string& lhs = lhs_rhs_out[0];
- if (!is_unique(lhs)) {
- return TokenError(
- StrCat("expects unique lhs dimension numbers, but sees ", lhs));
- }
- for (int i = 0; i < rank - 2; i++) {
- dnums->add_spatial_dimensions(-1);
- }
- for (int i = 0; i < rank; i++) {
- char c = lhs[i];
- if (c == 'b') {
- dnums->set_input_batch_dimension(i);
- } else if (c == 'f') {
- dnums->set_input_feature_dimension(i);
- } else if (c < '0' + rank && c >= '0') {
- dnums->set_spatial_dimensions(c - '0', i);
- } else {
- return TokenError(
- Printf("expects [0-%lldbf] in lhs dimension numbers", rank - 1));
- }
- }
- }
- // rhs
- {
- const string& rhs = lhs_rhs_out[1];
- if (!is_unique(rhs)) {
- return TokenError(
- StrCat("expects unique rhs dimension numbers, but sees ", rhs));
- }
- for (int i = 0; i < rank - 2; i++) {
- dnums->add_kernel_spatial_dimensions(-1);
- }
- for (int i = 0; i < rank; i++) {
- char c = rhs[i];
- if (c == 'i') {
- dnums->set_kernel_input_feature_dimension(i);
- } else if (c == 'o') {
- dnums->set_kernel_output_feature_dimension(i);
- } else if (c < '0' + rank && c >= '0') {
- dnums->set_kernel_spatial_dimensions(c - '0', i);
- } else {
- return TokenError(
- Printf("expects [0-%lldio] in rhs dimension numbers", rank - 1));
- }
- }
- }
- // output
- {
- const string& out = lhs_rhs_out[2];
- if (!is_unique(out)) {
- return TokenError(
- StrCat("expects unique output dimension numbers, but sees ", out));
- }
- for (int i = 0; i < rank; i++) {
- char c = out[i];
- if (c == 'b') {
- dnums->set_output_batch_dimension(i);
- } else if (c == 'f') {
- dnums->set_output_feature_dimension(i);
- } else if (c < '0' + rank && c >= '0') {
- if (dnums->spatial_dimensions(c - '0') != i) {
- return TokenError(
- "output spatial dimensions should be the same as input spatial "
- "dimensions");
- }
- } else {
- return TokenError(
- Printf("expects [0-%lldbf] in output dimension numbers", rank - 1));
- }
- }
- }
-
- lexer_.Lex();
- return true;
-}
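Concretely, the pattern b0f_0io->b0f used by the convolution test removed further below decodes, per the three loops above, to:

  ConvolutionDimensionNumbers dnums;
  // lhs "b0f": batch, spatial dim 0, feature
  dnums.set_input_batch_dimension(0);
  dnums.add_spatial_dimensions(1);
  dnums.set_input_feature_dimension(2);
  // rhs "0io": spatial dim 0, input feature, output feature
  dnums.add_kernel_spatial_dimensions(0);
  dnums.set_kernel_input_feature_dimension(1);
  dnums.set_kernel_output_feature_dimension(2);
  // out "b0f": batch and feature; spatial dims must match the lhs
  dnums.set_output_batch_dimension(0);
  dnums.set_output_feature_dimension(2);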
-
-// ::= '{' ranges '}'
-// ::= /*empty*/
-// ::= range (',' range)*
-// range ::= '[' start ':' limit (':' stride)? ']'
-//
-// The slice ranges are printed as:
-//
-// {[dim0_start:dim0_limit:dim0stride], [dim1_start:dim1_limit], ...}
-//
-// This function extracts the starts, limits, and strides as 3 vectors to the
-// result. If stride is not present, stride is 1. For example, if the slice
-// ranges are printed as:
-//
-// {[2:3:4], [5:6:7], [8:9]}
-//
-// Then the parsed result will be:
-//
-// {/*starts=*/{2, 5, 8}, /*limits=*/{3, 6, 9}, /*strides=*/{4, 7, 1}}
-//
-bool HloParser::ParseSliceRanges(SliceRanges* result) {
- if (!ParseToken(TokKind::kLbrace, "expects '{' to start ranges")) {
- return false;
- }
- std::vector<std::vector<int64>> ranges;
- if (lexer_.GetKind() == TokKind::kRbrace) {
- // empty
- return ParseToken(TokKind::kRbrace, "expects '}' to end ranges");
- }
- do {
- ranges.emplace_back();
- if (!ParseInt64List(TokKind::kLsquare, TokKind::kRsquare, TokKind::kColon,
- &ranges.back())) {
- return false;
- }
- } while (EatIfPresent(TokKind::kComma));
-
- for (const auto& range : ranges) {
- if (range.size() != 2 && range.size() != 3) {
- return TokenError(Printf(
- "expects [start:limit:step] or [start:limit], but sees %ld elements.",
- range.size()));
- }
- }
-
- for (const auto& range : ranges) {
- result->starts.push_back(range[0]);
- result->limits.push_back(range[1]);
- result->strides.push_back(range.size() == 3 ? range[2] : 1);
- }
- return ParseToken(TokKind::kRbrace, "expects '}' to end ranges");
-}
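The worked example from the comment above, written out against the SliceRanges struct this function fills:

  SliceRanges result;
  result.starts = {2, 5, 8};
  result.limits = {3, 6, 9};
  result.strides = {4, 7, 1};  // the omitted stride in [8:9] defaults to 1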
-
-// int64list ::= start int64_elements end
-// int64_elements
-// ::= /*empty*/
-// ::= int64_val (delim int64_val)*
-bool HloParser::ParseInt64List(const TokKind start, const TokKind end,
- const TokKind delim,
- std::vector<int64>* result) {
- if (!ParseToken(start, StrCat("expects an int64 list starting with ",
- TokKindToString(start)))) {
- return false;
- }
- if (lexer_.GetKind() == end) {
- // empty
- } else {
- do {
- int64 i;
- if (!ParseInt64(&i)) {
- return false;
- }
- result->push_back(i);
- } while (EatIfPresent(delim));
- }
- return ParseToken(
- end, StrCat("expects an int64 list to end with ", TokKindToString(end)));
+template <>
+bool HloParser::ParseAttributeValue<int64>(int64* value) {
+ return ParseInt64(value);
}
// param_list ::= '(' param_list1 ')'
@@ -1763,82 +1070,6 @@ bool HloParser::ParseAttributeName(string* result) {
return true;
}
-bool HloParser::ParseDxD(const string& name, std::vector<int64>* result) {
- if (!result->empty()) {
- return TokenError(
- Printf("sub-attribute '%s=' already exists", name.c_str()));
- }
- // 1D
- if (lexer_.GetKind() == TokKind::kInt) {
- int64 number;
- if (!ParseInt64(&number)) {
- return TokenError(Printf("expects sub-attribute '%s=i'", name.c_str()));
- }
- result->push_back(number);
- return true;
- }
- // 2D or higher.
- if (lexer_.GetKind() == TokKind::kDxD) {
- string str = lexer_.GetStrVal();
- if (!SplitAndParseAsInts(str, 'x', result)) {
- return TokenError(
- Printf("expects sub-attribute '%s=ixj...'", name.c_str()));
- }
- lexer_.Lex();
- return true;
- }
- return TokenError("expects token type kInt or kDxD");
-}
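Two token shapes reach this helper; a brief sketch of what each yields (input strings illustrative):

  std::vector<int64> size;
  // a kInt token "7" simply appends: size == {7}
  // a kDxD token "1x4x1x1" is split on 'x':
  SplitAndParseAsInts("1x4x1x1", 'x', &size);  // size == {1, 4, 1, 1}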
-
-bool HloParser::ParseWindowPad(std::vector<std::vector<int64>>* pad) {
- if (!pad->empty()) {
- return TokenError("sub-attribute 'pad=' already exists");
- }
- if (lexer_.GetKind() != TokKind::kPad) {
- return TokenError("expects window pad pattern, e.g., '0_0x3_3'");
- }
- string str = lexer_.GetStrVal();
- std::vector<string> padding_str = Split(str, 'x');
- for (int i = 0; i < padding_str.size(); i++) {
- std::vector<int64> low_high;
- if (!SplitAndParseAsInts(padding_str[i], '_', &low_high) ||
- low_high.size() != 2) {
- return TokenError(
- "expects padding_low and padding_high separated by '_'");
- }
- pad->push_back(low_high);
- }
- lexer_.Lex();
- return true;
-}
-
-// This is the inverse xla::ToString(PaddingConfig). The padding config string
-// looks like "0_0_0x3_3_1". The string is first separated by 'x', each
-// substring represents one PaddingConfigDimension. The substring is 3 (or 2)
-// numbers joined by '_'.
-bool HloParser::ParsePaddingConfig(PaddingConfig* padding) {
- if (lexer_.GetKind() != TokKind::kPad) {
- return TokenError("expects padding config, e.g., '0_0_0x3_3_1'");
- }
- string str = lexer_.GetStrVal();
- std::vector<string> padding_str = Split(str, 'x');
- for (const auto& padding_dim_str : padding_str) {
- std::vector<int64> padding_dim;
- if (!SplitAndParseAsInts(padding_dim_str, '_', &padding_dim) ||
- (padding_dim.size() != 2 && padding_dim.size() != 3)) {
- return TokenError(
- "expects padding config pattern like 'low_high_interior' or "
- "'low_high'");
- }
- auto* dim = padding->add_dimensions();
- dim->set_edge_padding_low(padding_dim[0]);
- dim->set_edge_padding_high(padding_dim[1]);
- dim->set_interior_padding(padding_dim.size() == 3 ? padding_dim[2] : 0);
- }
- lexer_.Lex();
- return true;
-}
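Matching the pad tests removed further below, the two-number form padding=3_1 decodes to:

  PaddingConfig padding;
  auto* dim = padding.add_dimensions();
  dim->set_edge_padding_low(3);
  dim->set_edge_padding_high(1);
  dim->set_interior_padding(0);  // only two numbers, so interior defaults to 0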
-
bool HloParser::ParseOpcode(HloOpcode* result) {
VLOG(1) << "ParseOpcode";
if (lexer_.GetKind() != TokKind::kOpcode) {
diff --git a/tensorflow/compiler/xla/tools/parser/hlo_parser_test.cc b/tensorflow/compiler/xla/tools/parser/hlo_parser_test.cc
index d19c6e1877..359256f064 100644
--- a/tensorflow/compiler/xla/tools/parser/hlo_parser_test.cc
+++ b/tensorflow/compiler/xla/tools/parser/hlo_parser_test.cc
@@ -25,7 +25,6 @@ namespace tools {
namespace {
using tensorflow::StringPiece;
-using tensorflow::strings::StrCat;
struct TestData {
string test_name;
@@ -36,10 +35,6 @@ string TestDataToString(const ::testing::TestParamInfo<TestData>& data) {
return data.param.test_name;
}
-// For each string below, we check that:
-// - we parse it to an HloModule successfully, and
-// - the stringification of the resulting HloModule is equal to our original
-// string.
std::vector<TestData> CreateTestCases() {
// clang-format off
return std::vector<TestData>({
@@ -48,11 +43,10 @@ std::vector<TestData> CreateTestCases() {
"AxpyParam",
R"(HloModule axpy_module:
-ENTRY %axpy.v5 (alpha: f32[], x: f32[2,4], y: f32[2,4]) -> f32[2,4] {
- %alpha = f32[] parameter(0)
- %broadcast = f32[2,4]{1,0} broadcast(f32[] %alpha), dimensions={}
+ENTRY %axpy.v5 (alpha: f32[2,4], x: f32[2,4], y: f32[2,4]) -> f32[2,4] {
+ %alpha = f32[2,4]{1,0} parameter(0)
%x = f32[2,4]{1,0} parameter(1)
- %multiply = f32[2,4]{1,0} multiply(f32[2,4]{1,0} %broadcast, f32[2,4]{1,0} %x)
+ %multiply = f32[2,4]{1,0} multiply(f32[2,4]{1,0} %alpha, f32[2,4]{1,0} %x)
%y = f32[2,4]{1,0} parameter(2)
ROOT %add = f32[2,4]{1,0} add(f32[2,4]{1,0} %multiply, f32[2,4]{1,0} %y)
}
@@ -157,7 +151,7 @@ ENTRY %SelectR1F32WithCmpR1F32sFromParamsSmall.v4 (v1: f32[4], v2: f32[4]) -> f3
%v1 = f32[4]{0} parameter(0), sharding={maximal device=1}
%v2 = f32[4]{0} parameter(1), sharding={maximal device=1}
%greater-than = pred[4]{0} greater-than(f32[4]{0} %v1, f32[4]{0} %v2), sharding={replicated}
- ROOT %select = f32[4]{0} select(pred[4]{0} %greater-than, f32[4]{0} %v1, f32[4]{0} %v2), sharding={}
+ ROOT %select = f32[4]{0} select(pred[4]{0} %greater-than, f32[4]{0} %v1, f32[4]{0} %v2)
}
)"
@@ -187,19 +181,6 @@ ENTRY %TupleCreate.v4 (v1: f32[], v2: f32[3], v3: f32[2,3]) -> (f32[], f32[3], f
)"
},
-{
-"ShardedTupleCreate",
-R"(HloModule ShardedTupleCreate_module:
-
-ENTRY %ShardedTupleCreate.v4 (v1: f32[], v2: f32[3], v3: f32[2,3]) -> (f32[], f32[3], f32[2,3]) {
- %v1 = f32[] parameter(0)
- %v2 = f32[3]{0} parameter(1)
- %v3 = f32[2,3]{1,0} parameter(2)
- ROOT %tuple = (f32[], f32[3]{0}, f32[2,3]{1,0}) tuple(f32[] %v1, f32[3]{0} %v2, f32[2,3]{1,0} %v3), sharding={{replicated}, {maximal device=0}, {replicated}}
-}
-
-)"
-},
// int32 result = 0;
// while (result < 5) { result = result + 1; }
{
@@ -231,11 +212,9 @@ ENTRY %WhileWithScalarS32Result.v2 () -> s32[] {
R"(HloModule TwoSendRecvBothWayRecvFist_module:
ENTRY %TwoSendRecvBothWayRecvFist.v3 () -> f32[] {
- %recv = (f32[], u32[]) recv(), channel_id=15, sharding={maximal device=1}
- ROOT %recv-done = f32[] recv-done((f32[], u32[]) %recv), channel_id=15, sharding={maximal device=1}
- %constant = f32[] constant(2.1), sharding={maximal device=0}
- %send = (f32[], u32[]) send(f32[] %constant), channel_id=16, sharding={maximal device=0}, control-predecessors={%recv}
- %send-done = () send-done((f32[], u32[]) %send), channel_id=16, sharding={maximal device=0}
+ %recv = f32[] recv(), channel_id=15, sharding={maximal device=1}
+ ROOT %constant = f32[] constant(2.1), sharding={maximal device=0}
+ %send = () send(f32[] %constant), channel_id=16, sharding={maximal device=0}, control-predecessors={%recv}
}
)"
@@ -269,277 +248,6 @@ ENTRY %CallR0F32IdentityScalar.v2 () -> f32[] {
}
)"
-},
-// reduce window
-{
-"ReduceWindow",
-R"(HloModule R4UnitWindow_module:
-
-%add_F32.v3 (lhs: f32[], rhs: f32[]) -> f32[] {
- %lhs = f32[] parameter(0)
- %rhs = f32[] parameter(1)
- ROOT %add = f32[] add(f32[] %lhs, f32[] %rhs)
-}
-
-ENTRY %R4UnitWindow.v3 (operand: f32[13,12,8,15]) -> f32[13,3,8,15] {
- %operand = f32[13,12,8,15]{0,3,2,1} parameter(0)
- %constant = f32[] constant(0)
- ROOT %reduce-window = f32[13,3,8,15]{0,3,2,1} reduce-window(f32[13,12,8,15]{0,3,2,1} %operand, f32[] %constant), window={size=1x1x7x1 stride=1x4x1x1 pad=0_0x0_0x3_3x0_0}, to_apply=%add_F32.v3
-}
-
-)"
-},
-// convolution
-{
-"Convolution",
-R"(HloModule Convolve1D1Window_0_module:
-
-ENTRY %Convolve1D1Window_0.v3 (input: f32[1,2,1], filter: f32[1,1,1]) -> f32[1,2,1] {
- %input = f32[1,2,1]{2,1,0} parameter(0)
- %copy = f32[1,2,1]{2,0,1} copy(f32[1,2,1]{2,1,0} %input)
- %filter = f32[1,1,1]{2,1,0} parameter(1)
- ROOT %convolution = f32[1,2,1]{2,0,1} convolution(f32[1,2,1]{2,0,1} %copy, f32[1,1,1]{2,1,0} %filter), window={size=1}, dim_labels=b0f_0io->b0f
-}
-
-)"
-},
-// reverse(constant)
-{
-"Reverse4D",
-R"(HloModule Reverse4DFloatArrayOnDim01_module:
-
-ENTRY %Reverse4DFloatArrayOnDim01.v2 () -> f32[4,3,2,1] {
- %constant = f32[4,3,2,1]{0,1,2,3} constant(f32[4,3,2,1] { { /*i0=0*/ { /*i1=0*/ {1}, {2} }, { /*i1=1*/ {3}, {4} }, { /*i1=2*/ {5}, {6} } }, { /*i0=1*/ { /*i1=0*/ {7}, {8} }, { /*i1=1*/ {9}, {10} }, { /*i1=2*/ {11}, {12} } }, { /*i0=2*/ { /*i1=0*/ {13}, {14} }, { /*i1=1*/ {15}, {16} }, { /*i1=2*/ {17}, {18} } }, { /*i0=3*/ { /*i1=0*/ {19}, {20} }, { /*i1=1*/ {21}, {22} }, { /*i1=2*/ {23}, {24} } } })
- ROOT %reverse = f32[4,3,2,1]{0,1,2,3} reverse(f32[4,3,2,1]{0,1,2,3} %constant), dimensions={0,1}
-}
-
-)"
-},
-// concat
-{
-"Concat",
-R"(HloModule Concat2x3With2x5_module:
-
-ENTRY %Concat2x3With2x5.v3 () -> f32[2,8] {
- %constant = f32[2,3]{1,0} constant(f32[2,3] { { 0, 1, 2 }, { 1000, 1001, 1002 } })
- %constant.1 = f32[2,5]{1,0} constant(f32[2,5] { { 64, 65, 66, 67, 68 }, { 1064, 1065, 1066, 1067, 1068 } })
- ROOT %concatenate = f32[2,8]{1,0} concatenate(f32[2,3]{1,0} %constant, f32[2,5]{1,0} %constant.1), dimensions={1}
-}
-
-)"
-},
-// map
-{
-"Map",
-R"(HloModule MapBinaryAdder_module:
-
-%add_F32.v3 (lhs: f32[], rhs: f32[]) -> f32[] {
- %lhs = f32[] parameter(0)
- %rhs = f32[] parameter(1)
- ROOT %add = f32[] add(f32[] %lhs, f32[] %rhs)
-}
-
-ENTRY %MapBinaryAdder.v3 (param0: f32[4], param1: f32[4]) -> f32[4] {
- %param0 = f32[4]{0} parameter(0)
- %param1 = f32[4]{0} parameter(1)
- ROOT %map = f32[4]{0} map(f32[4]{0} %param0, f32[4]{0} %param1), to_apply=%add_F32.v3
-}
-
-)"
-},
-// reduce
-{
-"Reduce",
-R"(HloModule ReduceR3ToR2_module:
-
-%add_F32.v3 (lhs: f32[], rhs: f32[]) -> f32[] {
- %lhs = f32[] parameter(0)
- %rhs = f32[] parameter(1)
- ROOT %add = f32[] add(f32[] %lhs, f32[] %rhs)
-}
-
-ENTRY %ReduceR3ToR2.v3 (input: f32[8,16,256]) -> f32[8,16] {
- %input = f32[8,16,256]{2,1,0} parameter(0)
- %constant = f32[] constant(0)
- ROOT %reduce = f32[8,16]{1,0} reduce(f32[8,16,256]{2,1,0} %input, f32[] %constant), dimensions={2}, to_apply=%add_F32.v3
-}
-
-)"
-},
-// select and scatter
-{
-"SelectAndScatter",
-R"(HloModule R4F32OverlapSmall_module:
-
-%ge_F32.v3 (lhs: f32[], rhs: f32[]) -> pred[] {
- %lhs = f32[] parameter(0)
- %rhs = f32[] parameter(1)
- ROOT %greater-than-or-equal-to = pred[] greater-than-or-equal-to(f32[] %lhs, f32[] %rhs)
-}
-
-%add_F32.v3 (lhs.1: f32[], rhs.1: f32[]) -> f32[] {
- %lhs.1 = f32[] parameter(0)
- %rhs.1 = f32[] parameter(1)
- ROOT %add = f32[] add(f32[] %lhs.1, f32[] %rhs.1)
-}
-
-ENTRY %R4F32OverlapSmall.v4 () -> f32[4,5,1,1] {
- %constant = f32[4,5,1,1]{3,2,1,0} constant(f32[4,5,1,1] { { /*i0=0*/ { /*i1=0*/ {7} }, { /*i1=1*/ {2} }, { /*i1=2*/ {5} }, { /*i1=3*/ {3} }, { /*i1=4*/ {8} } }, { /*i0=1*/ { /*i1=0*/ {3} }, { /*i1=1*/ {8} }, { /*i1=2*/ {9} }, { /*i1=3*/ {3} }, { /*i1=4*/ {4} } }, { /*i0=2*/ { /*i1=0*/ {1} }, { /*i1=1*/ {5} }, { /*i1=2*/ {7} }, { /*i1=3*/ {5} }, { /*i1=4*/ {6} } }, { /*i0=3*/ { /*i1=0*/ {0} }, { /*i1=1*/ {6} }, { /*i1=2*/ {2} }, { /*i1=3*/ {10} }, { /*i1=4*/ {2} } } })
- %constant.1 = f32[2,2,1,1]{3,2,1,0} constant(f32[2,2,1,1] { { /*i0=0*/ { /*i1=0*/ {2} }, { /*i1=1*/ {6} } }, { /*i0=1*/ { /*i1=0*/ {3} }, { /*i1=1*/ {1} } } })
- %constant.2 = f32[] constant(0)
- ROOT %select-and-scatter = f32[4,5,1,1]{3,2,1,0} select-and-scatter(f32[4,5,1,1]{3,2,1,0} %constant, f32[2,2,1,1]{3,2,1,0} %constant.1, f32[] %constant.2), window={size=2x3x1x1 stride=2x2x1x1}, select=%ge_F32.v3, scatter=%add_F32.v3
-}
-
-)"
-},
-// slice
-{
-"Slice",
-R"(HloModule slice_module:
-
-ENTRY %slice.v2 (p0: f32[3,3,4,4]) -> f32[3,3,2,4] {
- %p0 = f32[3,3,4,4]{3,2,1,0} parameter(0)
- ROOT %slice = f32[3,3,2,4]{3,2,1,0} slice(f32[3,3,4,4]{3,2,1,0} %p0), slice={[0:3:1], [0:3:1], [0:4:2], [0:4:1]}
-}
-
-)"
-},
-// slice, no stride
-{
-"SliceNoStride",
-R"(HloModule Slice3x3x3_To_1x3x3_F32_module:
-
-ENTRY %Slice3x3x3_To_1x3x3_F32.v2 () -> f32[1,3,3] {
- %constant = f32[3,3,3]{2,1,0} constant(f32[3,3,3] { { { 0, 1, 2 }, { 3, 4, 5 }, { 6, 7, 8 } }, { { 9, 10, 11 }, { 12, 13, 14 }, { 15, 16, 17 } }, { { 18, 19, 20 }, { 21, 22, 23 }, { 24, 25, 26 } } })
- ROOT %slice = f32[1,3,3]{2,1,0} slice(f32[3,3,3]{2,1,0} %constant), slice={[0:1], [0:3], [0:3]}
-}
-
-)"
-},
-// slice R0
-{
-"SliceR0",
-R"(HloModule SliceR0_module:
-
-ENTRY %SliceR0.v2 () -> s32[] {
- %constant = s32[] constant(1)
- ROOT %slice = s32[] slice(s32[] %constant), slice={}
-}
-
-)"
-},
-// transpose
-{
-"Transpose",
-R"(HloModule Transpose_module:
-
-ENTRY %Transpose.v2 () -> s32[1,2,3] {
- %constant = s32[1,2,3]{2,1,0} constant(s32[1,2,3] { { { 1, 2, 3 }, { 4, 5, 6 } } })
- ROOT %transpose = s32[1,2,3]{2,1,0} transpose(s32[1,2,3]{2,1,0} %constant), dimensions={0,1,2}
-}
-
-)"
-},
-// Dynamic slice
-{
-"DynamicSlice",
-R"(HloModule DynamicSlice_module:
-
-ENTRY %DynamicSlice.v5 (original_parameter: s32[2,2,258], start_index: s32[1]) -> s32[2,2,258] {
- %original_parameter = s32[2,2,258]{2,1,0} parameter(0)
- %constant = s32[1]{0} constant({0})
- %start_index = s32[1]{0} parameter(1)
- %concatenate = s32[3]{0} concatenate(s32[1]{0} %constant, s32[1]{0} %constant, s32[1]{0} %start_index), dimensions={0}
- ROOT %dynamic-slice = s32[2,2,258]{2,1,0} dynamic-slice(s32[2,2,258]{2,1,0} %original_parameter, s32[3]{0} %concatenate), dynamic_slice_sizes={2,2,258}
-}
-
-)"
-},
-// Dynamic update slice
-{
-"DynamicUpdateSlice",
-R"(HloModule DynamicUpdateSlice_module:
-
-ENTRY %DynamicUpdateSlice.v4 (input: s32[1,1,25,1], update: s32[1,1,2,1], start_indices: s32[4]) -> s32[1,1,25,1] {
- %input = s32[1,1,25,1]{3,2,1,0} parameter(0)
- %update = s32[1,1,2,1]{3,2,1,0} parameter(1)
- %start_indices = s32[4]{0} parameter(2)
- ROOT %dynamic-update-slice = s32[1,1,25,1]{3,2,1,0} dynamic-update-slice(s32[1,1,25,1]{3,2,1,0} %input, s32[1,1,2,1]{3,2,1,0} %update, s32[4]{0} %start_indices)
-}
-
-)"
-},
-// batch norm training
-{
-"BatchNormTraining",
-R"(HloModule BasicTraining_module:
-
-ENTRY %BasicTraining.v4 () -> (f32[2,2,1,2], f32[2], f32[2]) {
- %constant = f32[2,2,1,2]{3,2,1,0} constant(f32[2,2,1,2] { { /*i0=0*/ { /*i1=0*/ {1, 2} }, { /*i1=1*/ {3, 4} } }, { /*i0=1*/ { /*i1=0*/ {5, 6} }, { /*i1=1*/ {7, 8} } } })
- %constant.1 = f32[2]{0} constant({2, 3})
- %constant.2 = f32[2]{0} constant({1, 2})
- ROOT %batch-norm-training = (f32[2,2,1,2]{3,2,1,0}, f32[2]{0}, f32[2]{0}) batch-norm-training(f32[2,2,1,2]{3,2,1,0} %constant, f32[2]{0} %constant.1, f32[2]{0} %constant.2), epsilon=0.001, feature_index=3
-}
-
-)"
-},
-// batch norm inference
-{
-"BatchNormInference",
-R"(HloModule BatchNormInference_module:
-
-ENTRY %BatchNormInference.v6 (input: f32[2,2,2,2], offset: f32[2], scale: f32[2], mean: f32[2], variance: f32[2]) -> f32[2,2,2,2] {
- %input = f32[2,2,2,2]{3,2,1,0} parameter(0)
- %offset = f32[2]{0} parameter(1)
- %scale = f32[2]{0} parameter(2)
- %mean = f32[2]{0} parameter(3)
- %variance = f32[2]{0} parameter(4)
- ROOT %batch-norm-inference = f32[2,2,2,2]{3,2,1,0} batch-norm-inference(f32[2,2,2,2]{3,2,1,0} %input, f32[2]{0} %offset, f32[2]{0} %scale, f32[2]{0} %mean, f32[2]{0} %variance), epsilon=0.001, feature_index=0
-}
-
-)"
-},
-// batch norm grad
-{
-"BatchNormGrad",
-R"(HloModule BatchNormGrad_module:
-
-ENTRY %BatchNormGrad.v4 (input: f32[2,2,2,2], scale: f32[2], mean: f32[2], variance: f32[2], grad_output: f32[2,2,2,2]) -> (f32[2,2,2,2], f32[2], f32[2]) {
- %input = f32[2,2,2,2]{3,2,1,0} parameter(0)
- %scale = f32[2]{0} parameter(1)
- %mean = f32[2]{0} parameter(2)
- %variance = f32[2]{0} parameter(3)
- %grad_output = f32[2,2,2,2]{3,2,1,0} parameter(4)
- ROOT %batch-norm-grad = (f32[2,2,2,2]{3,2,1,0}, f32[2]{0}, f32[2]{0}) batch-norm-grad(f32[2,2,2,2]{3,2,1,0} %input, f32[2]{0} %scale, f32[2]{0} %mean, f32[2]{0} %variance, f32[2,2,2,2]{3,2,1,0} %grad_output), epsilon=0.001, feature_index=0
-}
-
-)"
-},
-// pad
-{
-"Pad",
-R"(HloModule Pad1DS3Array_module:
-
-ENTRY %Pad1DS3Array.v3 () -> f32[8] {
- %constant = f32[3]{0} constant({1, 2, 3})
- %constant.1 = f32[] constant(0.1)
- ROOT %pad = f32[8]{0} pad(f32[3]{0} %constant, f32[] %constant.1), padding=3_1
-}
-
-)"
-},
-// pad has interior
-{
-"PadHasInterior",
-R"(HloModule PadHasInterior_module:
-
-ENTRY %PadHasInterior.v3 (input: f32[1,25,7,7]) -> f32[1,25,17,11] {
- %input = f32[1,25,7,7]{3,2,1,0} parameter(0)
- %constant = f32[] constant(-5.123)
- ROOT %pad = f32[1,25,17,11]{3,2,1,0} pad(f32[1,25,7,7]{3,2,1,0} %input, f32[] %constant), padding=0_0_0x0_0_0x2_2_1x2_2_0
-}
-
-)"
}
});
// clang-format on
@@ -553,10 +261,7 @@ class HloParserTest : public ::testing::Test,
<< "'" << s << "' does not contain '" << expected << "'";
}
- // Expects "ToString(Parse(string)) == string", that is, parses the string,
- // asserts that it succeeded, stringifies the parsed module, and checks that
- // the it equals the original string.
- void ExpectEqual() {
+ void ExpectSuccess() {
const string& original = GetParam().module_string;
auto result = Parse(original);
TF_EXPECT_OK(result.status());
@@ -565,7 +270,7 @@ class HloParserTest : public ::testing::Test,
}
};
-TEST_P(HloParserTest, Run) { ExpectEqual(); }
+TEST_P(HloParserTest, Run) { ExpectSuccess(); }
INSTANTIATE_TEST_CASE_P(HloParserTestSuccessInstantiation, HloParserTest,
::testing::ValuesIn(CreateTestCases()),
@@ -722,125 +427,6 @@ ENTRY %ConstantWithExp.v4 () -> f32[] {
// printed as "300".
}
-TEST_F(HloParserTest, AttibutesAnyOrder) {
- const string original = R"(HloModule any_order_module:
-
-ENTRY %Convolve1D1Window_0.v3 (input: f32[1,2,1], filter: f32[1,1,1]) -> f32[1,2,1] {
- %input = f32[1,2,1]{2,1,0} parameter(0)
- %copy = f32[1,2,1]{2,0,1} copy(f32[1,2,1]{2,1,0} %input)
- %filter = f32[1,1,1]{2,1,0} parameter(1)
- ROOT %convolution = f32[1,2,1]{2,0,1} convolution(f32[1,2,1]{2,0,1} %copy, f32[1,1,1]{2,1,0} %filter), sharding={maximal device=1}, dim_labels=b0f_0io->b0f, window={pad=1_1 size=2}
-}
-
-)";
- TF_EXPECT_OK(Parse(original).status());
-}
-
-TEST_F(HloParserTest, InvalidDimLabels) {
- string prefix = R"(HloModule invalid_dim_labels_module:
-
-ENTRY %Convolve1D1Window_0.v3 (input: f32[1,2,1], filter: f32[1,1,1]) -> f32[1,2,1] {
- %input = f32[1,2,1]{2,1,0} parameter(0)
- %copy = f32[1,2,1]{2,0,1} copy(f32[1,2,1]{2,1,0} %input)
- %filter = f32[1,1,1]{2,1,0} parameter(1)
- ROOT %convolution = f32[1,2,1]{2,0,1} convolution(f32[1,2,1]{2,0,1} %copy, f32[1,1,1]{2,1,0} %filter), window={size=1} )";
- string suffix = R"(
-}
-
-)";
-
- ExpectHasSubstr(Parse(StrCat(prefix, ",dim_labels=00_01_10", suffix))
- .status()
- .error_message(),
- "expects dim labels pattern");
-
- ExpectHasSubstr(Parse(StrCat(prefix, ",dim_labels=010_1100->010", suffix))
- .status()
- .error_message(),
- "must have the same rank");
-
- ExpectHasSubstr(Parse(StrCat(prefix, ",dim_labels=0bf_io0->b0f", suffix))
- .status()
- .error_message(),
- "output spatial dimensions should be the same as input "
- "spatial dimensions");
-}
-
-TEST_F(HloParserTest, UnexpectedAttribute) {
- const string original = R"(HloModule unexpected_attr_module:
-
-ENTRY %TwoSendRecvBothWayRecvFist.v3 () -> f32[] {
- %recv = (f32[], u32[]) recv(), channel_id=15
- %recv-done = f32[] recv-done((f32[], u32[]) %recv), channel_id=15
- ROOT %constant = f32[] constant(2.1)
- %send = (f32[], u32[]) send(f32[] %constant), channel_id=16, calls=%recv
- %send-done = () send-done((f32[], u32[]) %send), channel_id=16
-}
-
-)";
- ExpectHasSubstr(Parse(original).status().error_message(),
- "unexpected attribute calls");
-}
-
-TEST_F(HloParserTest, MissingAttribute) {
- const string original = R"(HloModule missing_attr_module:
-
-ENTRY %TwoSendRecvBothWayRecvFist.v3 () -> f32[] {
- %recv = (f32[], u32[]) recv(), channel_id=15
- %recv-done = f32[] recv-done((f32[], u32[]) %recv), channel_id=15
- ROOT %constant = f32[] constant(-2.1)
- %send = (f32[], u32[]) send(f32[] %constant)
- %send-done = () send-done((f32[], u32[]) %send), channel_id=16
-}
-
-)";
- ExpectHasSubstr(Parse(original).status().error_message(),
- "attribute channel_id is expected but not seen");
-}
-
-TEST_F(HloParserTest, PredecessorUndefined) {
- const string original = R"(HloModule pre_not_found_module:
-
-ENTRY %TwoSendRecvBothWayRecvFist.v3 () -> f32[] {
- %recv = (f32[], u32[]) recv(), channel_id=15
- %recv-done = f32[] recv-done((f32[], u32[]) %recv), channel_id=15
- ROOT %constant = f32[] constant(2.1)
- %send = (f32[], u32[]) send(f32[] %constant), channel_id=16, control-predecessors={%done}
- %send-done = () send-done((f32[], u32[]) %send), channel_id=16
-}
-
-)";
- ExpectHasSubstr(Parse(original).status().error_message(),
- "'done' is not defined");
-}
-
-TEST_F(HloParserTest, SliceAllowOmitStride1) {
- const string original = R"(HloModule slice_module:
-
-ENTRY %slice.v2 (p0: f32[3,3,4,4]) -> f32[3,3,2,4] {
- %p0 = f32[3,3,4,4]{3,2,1,0} parameter(0)
- ROOT %slice = f32[3,3,2,4]{3,2,1,0} slice(f32[3,3,4,4]{3,2,1,0} %p0), slice={[0:3], [0:3], [0:4:2], [0:4]}
-}
-
-)";
- TF_EXPECT_OK(Parse(original).status());
-}
-
-TEST_F(HloParserTest, PaddingConfigIsNotWindowPad) {
- const string original = R"(HloModule window_pad_module:
-
-ENTRY %Convolve1D1Window_0.v3 (input: f32[1,2,1], filter: f32[1,1,1]) -> f32[1,2,1] {
- %input = f32[1,2,1]{2,1,0} parameter(0)
- %copy = f32[1,2,1]{2,0,1} copy(f32[1,2,1]{2,1,0} %input)
- %filter = f32[1,1,1]{2,1,0} parameter(1)
- ROOT %convolution = f32[1,2,1]{2,0,1} convolution(f32[1,2,1]{2,0,1} %copy, f32[1,1,1]{2,1,0} %filter), dim_labels=b0f_0io->b0f, window={pad=1_1_0 size=1}
-}
-
-)";
- ExpectHasSubstr(Parse(original).status().error_message(),
- "expects padding_low and padding_high separated by '_'");
-}
-
} // namespace
} // namespace tools
} // namespace xla
diff --git a/tensorflow/compiler/xla/tools/parser/hlo_token.h b/tensorflow/compiler/xla/tools/parser/hlo_token.h
index 9afd2fac23..9c2069e756 100644
--- a/tensorflow/compiler/xla/tools/parser/hlo_token.h
+++ b/tensorflow/compiler/xla/tools/parser/hlo_token.h
@@ -57,9 +57,6 @@ enum class TokKind {
// Typed tokens.
kName, // %foo
kAttributeName, // dimensions=
- kDimLabels, // [0-9bf]+_[0-9io]+->[0-9bf]+
- kDxD, // [0-9]+(x[0-9]+)+
- kPad, // [0-9]+_[0-9]+(_[0-9]+)?(x[0-9]+_[0-9]+(_[0-9]+)?)*
kShape, // f32[2,3]{1,0}
kOpcode, // add
kInt, // 42
diff --git a/tensorflow/compiler/xla/tools/replay_computation.cc b/tensorflow/compiler/xla/tools/replay_computation.cc
index 503e7d456e..89b26b8916 100644
--- a/tensorflow/compiler/xla/tools/replay_computation.cc
+++ b/tensorflow/compiler/xla/tools/replay_computation.cc
@@ -45,7 +45,6 @@ limitations under the License.
#include "tensorflow/compiler/xla/shape_util.h"
#include "tensorflow/compiler/xla/status_macros.h"
#include "tensorflow/compiler/xla/statusor.h"
-#include "tensorflow/compiler/xla/tests/test_utils.h"
#include "tensorflow/compiler/xla/types.h"
#include "tensorflow/compiler/xla/xla_data.pb.h"
#include "tensorflow/core/lib/core/threadpool.h"
diff --git a/tensorflow/compiler/xla/types.h b/tensorflow/compiler/xla/types.h
index 9fa4297523..3b19ca321c 100644
--- a/tensorflow/compiler/xla/types.h
+++ b/tensorflow/compiler/xla/types.h
@@ -19,7 +19,6 @@ limitations under the License.
#include <complex>
#include "third_party/eigen3/Eigen/Core"
-#include "tensorflow/core/framework/numeric_types.h"
#include "tensorflow/core/platform/types.h"
#include <Eigen/Core>
@@ -33,8 +32,6 @@ using ::tensorflow::int16;
using ::tensorflow::int32;
using ::tensorflow::int64;
-using ::tensorflow::bfloat16;
-
using ::tensorflow::uint8;
using ::tensorflow::uint16;
using ::tensorflow::uint32;
diff --git a/tensorflow/compiler/xla/window_util.cc b/tensorflow/compiler/xla/window_util.cc
index 6f7f1479b9..23161873a0 100644
--- a/tensorflow/compiler/xla/window_util.cc
+++ b/tensorflow/compiler/xla/window_util.cc
@@ -26,8 +26,8 @@ namespace xla {
namespace window_util {
/* static */ string ToString(const WindowDimension& dim) {
- using tensorflow::strings::StrAppend;
using tensorflow::strings::StrCat;
+ using tensorflow::strings::StrAppend;
string str = StrCat("(size=", dim.size());
if (dim.stride() != 1) {
StrAppend(&str, ",stride=", dim.stride());
@@ -49,22 +49,22 @@ namespace window_util {
}
string ToString(const Window& window) {
- using tensorflow::strings::StrAppend;
using tensorflow::strings::StrCat;
+ using tensorflow::strings::StrAppend;
string str;
- const auto add_field =
- [&](const char* heading,
- std::function<string(const WindowDimension&)> format) {
- StrAppend(&str, heading, "=");
- const char* prefix = "";
- for (const auto& window_dimension : window.dimensions()) {
- StrAppend(&str, prefix, format(window_dimension));
- prefix = "x";
- }
- };
-
- add_field("size",
+ const auto add_field = [&](
+ const char* heading,
+ std::function<string(const WindowDimension&)> format) {
+ StrAppend(&str, heading, "=");
+ const char* prefix = "";
+ for (const auto& window_dimension : window.dimensions()) {
+ StrAppend(&str, prefix, format(window_dimension));
+ prefix = "x";
+ }
+ };
+
+ add_field("window",
[](const WindowDimension& dim) { return StrCat(dim.size()); });
if (HasStride(window)) {
add_field(" stride",
diff --git a/tensorflow/compiler/xla/xla_data.proto b/tensorflow/compiler/xla/xla_data.proto
index eac8f2ff07..06987e0044 100644
--- a/tensorflow/compiler/xla/xla_data.proto
+++ b/tensorflow/compiler/xla/xla_data.proto
@@ -46,12 +46,6 @@ enum PrimitiveType {
// converted to f16 from f32 at arbitrary points in the computation.
F16 = 10;
F32 = 11;
-
- // Truncated 16 bit floating-point format. This is similar to IEEE's 16 bit
- // floating-point format, but uses 1 bit for the sign, 8 bits for the exponent
- // and 7 bits for the mantissa.
- BF16 = 16;
-
F64 = 12;
// Complex values of fixed width.
@@ -69,8 +63,6 @@ enum PrimitiveType {
// An opaque type used for passing context specific data to a custom
// operation.
OPAQUE = 14;
-
- // Next = 17
}
// Describes the value held inside padding elements.
@@ -318,10 +310,7 @@ message LiteralProto {
repeated double f64s = 9;
repeated float c64s = 12; // Stored as interleaved real, imag floats.
repeated LiteralProto tuple_literals = 10;
- // The F16s and BF16s are encoded in little endian byte order
- bytes f16s = 11;
- bytes bf16s = 13;
- // Next = 14
+ bytes f16s = 11; // Note: the F16s are encoded in little endian byte order
}
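A sketch of that encoding, assuming a LiteralProto instance named literal: the f16 value 1.0 has bit pattern 0x3C00 and is appended low byte first:

  string* f16s = literal.mutable_f16s();
  f16s->push_back('\x00');  // low byte
  f16s->push_back('\x3C');  // high byte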
message WindowDimension {
@@ -836,10 +825,8 @@ message OpSharding {
REPLICATED = 0;
// This sharding is maximal - one device runs the entire operation.
MAXIMAL = 1;
- // This sharding is a tuple - only the tuple_shardings field is valid.
- TUPLE = 2;
- // None of the above; tile_shape and tile_assignment are both used.
- OTHER = 3;
+ // Neither of the above; tile_shape and tile_assignment are both used.
+ OTHER = 2;
}
Type type = 1;
// The shape of the sharded tile.
@@ -851,13 +838,6 @@ message OpSharding {
// Flattened list of device IDs. The order of flattening is the same as used
// by IndexUtil::MultiToLinearIndex(tile_assignment_shape).
repeated int64 tile_assignment_devices = 4;
- // If type == TUPLE, the sub-shardings, one per leaf node in the tuple shape,
- // in pre-order. The tuple shape could be nested; here we store just a
- // flattened list of all leaves in the tuple shape. Note that the tuple shape
- // is not stored here; shardings do not store the shapes to which they are
- // applied, this is inferred from the instruction this sharding gets attached
- // to.
- repeated OpSharding tuple_shardings = 5;
}
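For a maximal sharding such as sharding={maximal device=1} in the parser tests above, one plausible encoding of this message, sketched in C++ setter form (the assumption that the device id lands in tile_assignment_devices is ours):

  OpSharding sharding;
  sharding.set_type(OpSharding::Type::OpSharding_Type_MAXIMAL);
  sharding.add_tile_assignment_devices(1);  // the one device that runs the op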
message OpRequest {
diff --git a/tensorflow/contrib/batching/BUILD b/tensorflow/contrib/batching/BUILD
index 8b7df4a84c..a111cfecb3 100644
--- a/tensorflow/contrib/batching/BUILD
+++ b/tensorflow/contrib/batching/BUILD
@@ -82,6 +82,7 @@ cc_library(
tf_cc_test(
name = "adaptive_shared_batch_scheduler_test",
srcs = ["adaptive_shared_batch_scheduler_test.cc"],
+ tags = ["manual"], # b/69013768
deps = [
":adaptive_shared_batch_scheduler",
"//tensorflow/contrib/batching/test_util:fake_clock_env",
diff --git a/tensorflow/contrib/batching/kernels/batch_kernels.cc b/tensorflow/contrib/batching/kernels/batch_kernels.cc
index 3b7c538fcc..6041d8c9b2 100644
--- a/tensorflow/contrib/batching/kernels/batch_kernels.cc
+++ b/tensorflow/contrib/batching/kernels/batch_kernels.cc
@@ -461,7 +461,7 @@ class BatchResource : public ResourceBase {
return Status::OK();
}
- // Looks up the batcher queue for 'queue_name'. If it did't previously exist,
+ // Looks up the batcher queue for 'queue_name'. If it didn't previously exist,
// creates it.
Status LookupOrCreateBatcherQueue(const string& queue_name,
BatcherQueue** queue) {
diff --git a/tensorflow/contrib/bayesflow/python/kernel_tests/csiszar_divergence_test.py b/tensorflow/contrib/bayesflow/python/kernel_tests/csiszar_divergence_test.py
index 8c6a614beb..2e94b7206d 100644
--- a/tensorflow/contrib/bayesflow/python/kernel_tests/csiszar_divergence_test.py
+++ b/tensorflow/contrib/bayesflow/python/kernel_tests/csiszar_divergence_test.py
@@ -759,7 +759,7 @@ class CsiszarVIMCOTest(test.TestCase):
def _csiszar_vimco_helper_grad(self, logu, delta):
"""Finite difference approximation of `grad(csiszar_vimco_helper, logu)`."""
- # This code actually estimates the sum of the Jacobiab because thats what
+ # This code actually estimates the sum of the Jacobian because that's what
# TF's `gradients` does.
np_log_avg_u1, np_log_sooavg_u1 = self._csiszar_vimco_helper(
logu[..., None] + np.diag([delta]*len(logu)))
diff --git a/tensorflow/contrib/boosted_trees/lib/quantiles/weighted_quantiles_buffer.h b/tensorflow/contrib/boosted_trees/lib/quantiles/weighted_quantiles_buffer.h
index 70037d5bd8..5e316538ce 100644
--- a/tensorflow/contrib/boosted_trees/lib/quantiles/weighted_quantiles_buffer.h
+++ b/tensorflow/contrib/boosted_trees/lib/quantiles/weighted_quantiles_buffer.h
@@ -33,9 +33,9 @@ template <typename ValueType, typename WeightType,
class WeightedQuantilesBuffer {
public:
struct BufferEntry {
- BufferEntry(ValueType v, WeightType w)
- : value(std::move(v)), weight(std::move(w)) {}
- BufferEntry() : value(), weight(0) {}
+ BufferEntry(const ValueType& v, const WeightType& w)
+ : value(v), weight(w) {}
+ BufferEntry() : value(0), weight(0) {}
bool operator<(const BufferEntry& other) const {
return kCompFn(value, other.value);
@@ -67,7 +67,7 @@ class WeightedQuantilesBuffer {
// Push entry to buffer and maintain a compact representation within
// pre-defined size limit.
- void PushEntry(ValueType value, WeightType weight) {
+ void PushEntry(const ValueType& value, const WeightType& weight) {
// Callers are expected to act on a full compacted buffer after the
// PushEntry call returns.
QCHECK(!IsFull()) << "Buffer already full: " << max_size_;
@@ -78,7 +78,7 @@ class WeightedQuantilesBuffer {
}
// Push back the entry to the buffer.
- vec_.push_back(BufferEntry(std::move(value), std::move(weight)));
+ vec_.push_back(BufferEntry(value, weight));
}
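Since entries are now copied rather than moved, a guarded push helper, using only the members visible in this hunk (the helper itself is hypothetical), might look like:

  template <typename Buffer, typename ValueType, typename WeightType>
  bool PushIfRoom(Buffer* buffer, const ValueType& value,
                  const WeightType& weight) {
    if (buffer->IsFull()) {
      return false;  // caller must compact/flush before pushing more entries
    }
    buffer->PushEntry(value, weight);  // copies value and weight into the buffer
    return true;
  }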
// Returns a sorted vector view of the base buffer and clears the buffer.
diff --git a/tensorflow/contrib/cmake/CMakeLists.txt b/tensorflow/contrib/cmake/CMakeLists.txt
index 8744fc492f..77a3fc0c83 100644
--- a/tensorflow/contrib/cmake/CMakeLists.txt
+++ b/tensorflow/contrib/cmake/CMakeLists.txt
@@ -34,13 +34,41 @@ option(tensorflow_BUILD_SHARED_LIB "Build TensorFlow as a shared library" OFF)
option(tensorflow_OPTIMIZE_FOR_NATIVE_ARCH "Enable compiler optimizations for the native processor architecture (if available)" ON)
option(tensorflow_WIN_CPU_SIMD_OPTIONS "Enables CPU SIMD instructions")
option(tensorflow_ENABLE_SNAPPY_SUPPORT "Enable SNAPPY compression support" ON)
+if(HAIKU)
+ option(tensorflow_ENABLE_POSITION_INDEPENDENT_CODE "Enable PIE support" OFF)
+else()
+ option(tensorflow_ENABLE_POSITION_INDEPENDENT_CODE "Enable PIE support" ON)
+endif()
+
if (NOT WIN32)
# Threads: defines CMAKE_THREAD_LIBS_INIT and adds -pthread compile option
# for targets that link ${CMAKE_THREAD_LIBS_INIT}.
find_package (Threads)
+
+ option(tensorflow_PATH_STATIC_LIB "Additional library search path for libcudnn_static.a, libnccl_static.a, libculibos.a" /usr/local/cuda/lib64/)
+ option(tensorflow_CUDNN_INCLUDE "cudnn.h header install path" /usr/include/)
+ if (NOT tensorflow_CUDNN_INCLUDE)
+ # option's default value is OFF. Fill it with real default values
+ set(tensorflow_CUDNN_INCLUDE /usr/include)
+ endif (NOT tensorflow_CUDNN_INCLUDE)
+ option(tensorflow_PATH_CUDNN_STATIC_LIB "Override PATH_STATIC_LIB for libcudnn_static.a" ${tensorflow_PATH_STATIC_LIB})
+ option(tensorflow_PATH_NCCL_STATIC_LIB "Override PATH_STATIC_LIB for libnccl_static.a" ${tensorflow_PATH_STATIC_LIB})
+ option(tensorflow_CUDA_LIBRARY_PATH "Designate the default CUDA library paths" /usr/local/cuda/lib64)
+ if (NOT tensorflow_CUDA_LIBRARY_PATH)
+ # option's default value is OFF. Fill it with real default values
+ set(tensorflow_CUDA_LIBRARY_PATH /usr/local/cuda/lib64)
+ endif (NOT tensorflow_CUDA_LIBRARY_PATH)
endif()
+if (WIN32)
+ set(BOOL_WIN32 ON)
+else (WIN32)
+ set(BOOL_WIN32 OFF)
+ set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -fPIC")
+ set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fPIC")
+endif (WIN32)
+
# [CLEANUP] Remove when done
# For debugging
function(SHOW_VARIABLES)
@@ -58,7 +86,12 @@ set (DOWNLOAD_LOCATION "${CMAKE_CURRENT_BINARY_DIR}/downloads"
CACHE PATH "Location where external projects will be downloaded.")
mark_as_advanced(DOWNLOAD_LOCATION)
-set(CMAKE_POSITION_INDEPENDENT_CODE ON)
+if (tensorflow_ENABLE_POSITION_INDEPENDENT_CODE)
+ set(CMAKE_POSITION_INDEPENDENT_CODE ON)
+else()
+ set(CMAKE_POSITION_INDEPENDENT_CODE OFF)
+endif()
+
add_definitions(-DEIGEN_AVOID_STL_ARRAY)
if(WIN32)
add_definitions(-DNOMINMAX -D_WIN32_WINNT=0x0A00 -DLANG_CXX11 -DCOMPILER_MSVC)
@@ -217,20 +250,35 @@ endif()
if(UNIX)
list(APPEND tensorflow_EXTERNAL_LIBRARIES ${CMAKE_THREAD_LIBS_INIT} ${CMAKE_DL_LIBS})
endif()
+if(HAIKU)
+ list(APPEND tensorflow_EXTERNAL_LIBRARIES network)
+endif()
if (tensorflow_ENABLE_GPU)
+ if (NOT WIN32)
+ # Default install paths for cuda libraries in Linux
+ # In some Linux distros, find_package(CUDA) seems to require CMAKE_LIBRARY_PATH to include cuda-lib paths
+ list(APPEND CMAKE_LIBRARY_PATH "${tensorflow_CUDA_LIBRARY_PATH}")
+ list(APPEND CMAKE_LIBRARY_PATH "${tensorflow_CUDA_LIBRARY_PATH}/stubs")
+ endif (NOT WIN32)
+
+ find_package(CUDA 8.0 REQUIRED)
+
+ # By default we assume compute capabilities 3.0, 3.5 and 5.2. If you change this, change it in
+ # CUDA_NVCC_FLAGS and cuda_config.h below
+ set(CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS};-gencode arch=compute_30,code=\"sm_30,compute_30\";-gencode arch=compute_35,code=\"sm_35,compute_35\";-gencode arch=compute_52,code=\"sm_52,compute_52\")
+ set(CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS};--include-path ${PROJECT_BINARY_DIR}/$\{build_configuration\};--expt-relaxed-constexpr)
+ set(CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS};-ftz=true) # Flush denormals to zero
+ set(CUDA_INCLUDE ${CUDA_TOOLKIT_TARGET_DIR} ${CUDA_TOOLKIT_TARGET_DIR}/extras/CUPTI/include)
+ include_directories(${CUDA_INCLUDE})
if (WIN32)
- find_package(CUDA 8.0 REQUIRED)
-
- # by default we assume compute cabability 3.5 and 5.2. If you change this change it in
- # CUDA_NVCC_FLAGS and cuda_config.h below
- set(CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS};-gencode arch=compute_30,code=\"sm_30,compute_30\";-gencode arch=compute_35,code=\"sm_35,compute_35\";-gencode arch=compute_52,code=\"sm_52,compute_52\")
- set(CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS};--include-path ${PROJECT_BINARY_DIR}/$\{build_configuration\};--expt-relaxed-constexpr)
- set(CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS};-ftz=true) # Flush denormals to zero
- set(CUDA_INCLUDE ${CUDA_TOOLKIT_TARGET_DIR} ${CUDA_TOOLKIT_TARGET_DIR}/extras/CUPTI/include)
- include_directories(${CUDA_INCLUDE})
add_definitions(-DGOOGLE_CUDA=1 -DTF_EXTRA_CUDA_CAPABILITIES=3.0,3.5,5.2)
+ else (WIN32)
+ # Without these double quotes, cmake in Linux expands it to "-DTF_EXTRA_CUDA_CAPABILITIES=3.0, -D3.5, -D5.2" for cc, which breaks the build
+ add_definitions(-DGOOGLE_CUDA=1 -D"TF_EXTRA_CUDA_CAPABILITIES=3.0,3.5,5.2")
+ endif (WIN32)
+ if (WIN32)
# add cudnn
if(NOT CUDNN_HOME)
set(CUDNN_HOME ${CUDA_TOOLKIT_TARGET_DIR})
@@ -238,18 +286,48 @@ if (tensorflow_ENABLE_GPU)
include_directories(${CUDNN_HOME})
set(CUDA_LIBRARIES ${CUDA_LIBRARIES} ${CUDA_CUDA_LIBRARY} ${CUDA_CUBLAS_LIBRARIES} ${CUDA_CUFFT_LIBRARIES}
${CUDA_curand_LIBRARY} ${CUDA_cupti_LIBRARY} ${CUDA_cusolver_LIBRARY} ${CUDNN_HOME}/lib/x64/cudnn.lib)
+ else (WIN32)
+ set(CUDNN_INCLUDE "${tensorflow_CUDNN_INCLUDE}")
- # create cuda_config.h
- FILE(WRITE ${tensorflow_source_dir}/third_party/gpus/cuda/cuda_config.h
- "#ifndef CUDA_CUDA_CONFIG_H_\n"
- "#define CUDA_CUDA_CONFIG_H_\n"
- "#define TF_CUDA_CAPABILITIES CudaVersion(\"3.0\"),CudaVersion(\"3.5\"),CudaVersion(\"5.2\")\n"
- "#define TF_CUDA_VERSION \"64_80\"\n"
- "#define TF_CUDNN_VERSION \"64_6\"\n"
- "#define TF_CUDA_TOOLKIT_PATH \"${CUDA_TOOLKIT_ROOT_DIR}\"\n"
- "#endif // CUDA_CUDA_CONFIG_H_\n"
- )
+ find_library(nccl_STATIC_LIBRARY NAMES libnccl_static.a PATHS ${tensorflow_PATH_NCCL_STATIC_LIB} ${CUDA_TOOLKIT_ROOT_DIR})
+ if (NOT nccl_STATIC_LIBRARY)
+ message(FATAL_ERROR "NCCL is required for GPU-build")
+ else (NOT nccl_STATIC_LIBRARY)
+ message("nccl-static: ${nccl_STATIC_LIBRARY}")
+ # something like /usr/lib64/libnccl_static.a
+ endif (NOT nccl_STATIC_LIBRARY)
+
+ find_library(cudnn_STATIC_LIBRARY NAMES libcudnn_static.a PATHS ${tensorflow_PATH_CUDNN_STATIC_LIB} ${CUDA_TOOLKIT_ROOT_DIR})
+ if (NOT cudnn_STATIC_LIBRARY)
+ message(FATAL_ERROR "CUDNN is required for GPU-build")
+ else (NOT cudnn_STATIC_LIBRARY)
+ message("cudnn-static: ${cudnn_STATIC_LIBRARY}")
+ endif (NOT cudnn_STATIC_LIBRARY)
+
+ find_library(culibos_STATIC_LIBRARY NAMES libculibos.a PATHS ${tensorflow_PATH_STATIC_LIB} ${CUDA_TOOLKIT_ROOT_DIR})
+ if (NOT culibos_STATIC_LIBRARY)
+ message(FATAL_ERROR "CULIBOS is required for GPU-build")
+ else (NOT culibos_STATIC_LIBRARY)
+ message("culibos-static: ${culibos_STATIC_LIBRARY}")
+ endif (NOT culibos_STATIC_LIBRARY)
+
+ include_directories(${CUDNN_INCLUDE})
+ set(CUDA_LIBRARIES ${CUDA_LIBRARIES} ${CUDA_CUDA_LIBRARY} ${CUDA_CUBLAS_LIBRARIES} ${CUDA_CUFFT_LIBRARIES}
+ ${CUDA_curand_LIBRARY} ${CUDA_cupti_LIBRARY} ${CUDA_cusolver_LIBRARY} ${cudnn_STATIC_LIBRARY} ${culibos_STATIC_LIBRARY} ${nccl_STATIC_LIBRARY})
+ endif (WIN32)
+
+ # create cuda_config.h
+ FILE(WRITE ${tensorflow_source_dir}/third_party/gpus/cuda/cuda_config.h
+ "#ifndef CUDA_CUDA_CONFIG_H_\n"
+ "#define CUDA_CUDA_CONFIG_H_\n"
+ "#define TF_CUDA_CAPABILITIES CudaVersion(\"3.0\"),CudaVersion(\"3.5\"),CudaVersion(\"5.2\")\n"
+ "#define TF_CUDA_VERSION \"64_80\"\n"
+ "#define TF_CUDNN_VERSION \"64_6\"\n"
+ "#define TF_CUDA_TOOLKIT_PATH \"${CUDA_TOOLKIT_ROOT_DIR}\"\n"
+ "#endif // CUDA_CUDA_CONFIG_H_\n"
+ )
+ if (WIN32)
# tf assumes in various places header files to be in cuda/include. On windows the cuda sdk
# installs them under cuda/version/include and to avoid that we need to change tf we copy a
# few files to cuda/include
@@ -261,12 +339,25 @@ if (tensorflow_ENABLE_GPU)
${CUDA_TOOLKIT_TARGET_DIR}/include/cusolverDn.h
DESTINATION ${tensorflow_source_dir}/third_party/gpus/cuda/include
)
- include_directories(${tensorflow_source_dir}/third_party/gpus)
- # add cuda libraries to tensorflow_EXTERNAL_LIBRARIES
- list(APPEND tensorflow_EXTERNAL_LIBRARIES ${CUDA_LIBRARIES})
+ else(WIN32)
+ # Linux has slightly different install paths than Windows
+ FILE(COPY
+ ${CUDA_TOOLKIT_TARGET_DIR}/include/cuda.h ${CUDA_TOOLKIT_TARGET_DIR}/include/cuComplex.h
+ ${CUDA_TOOLKIT_TARGET_DIR}/include/cublas_v2.h ${CUDNN_INCLUDE}/cudnn.h
+ ${CUDA_TOOLKIT_TARGET_DIR}/include/cufft.h ${CUDA_TOOLKIT_TARGET_DIR}/include/curand.h
+ ${CUDA_TOOLKIT_TARGET_DIR}/include/cuda_runtime_api.h
+ ${CUDA_TOOLKIT_TARGET_DIR}/include/cusolverDn.h
+ DESTINATION ${tensorflow_source_dir}/third_party/gpus/cuda/include
+ )
+ endif(WIN32)
- # NOTE(mrry): Update these flags when the version of CUDA or cuDNN used
- # in the default build is upgraded.
+ include_directories(${tensorflow_source_dir}/third_party/gpus)
+ # add cuda libraries to tensorflow_EXTERNAL_LIBRARIES
+ list(APPEND tensorflow_EXTERNAL_LIBRARIES ${CUDA_LIBRARIES})
+
+ # NOTE(mrry): Update these flags when the version of CUDA or cuDNN used
+ # in the default build is upgraded.
+ if(WIN32)
set(tensorflow_BUILD_INFO_FLAGS --build_config cuda --key_value
msvcp_dll_name=msvcp140.dll
cudart_dll_name=cudart64_80.dll
@@ -275,7 +366,9 @@ if (tensorflow_ENABLE_GPU)
cudnn_dll_name=cudnn64_6.dll
cudnn_version_number=6)
else(WIN32)
- message(FATAL_ERROR "CMake GPU build is currently only supported on Windows.")
+ set(tensorflow_BUILD_INFO_FLAGS --build_config cuda --key_value
+ cuda_version_number=8.0
+ cudnn_version_number=6)
endif(WIN32)
else(tensorflow_ENABLE_GPU)
set(tensorflow_BUILD_INFO_FLAGS --build_config cpu --key_value
@@ -293,9 +386,7 @@ include(tf_core_framework.cmake)
# NOTE: Disabled until issue #3996 is fixed.
# include(tf_stream_executor.cmake)
if (tensorflow_ENABLE_GPU)
- if (WIN32)
include(tf_stream_executor.cmake)
- endif()
endif()
include(tf_core_cpu.cmake)
diff --git a/tensorflow/contrib/cmake/external/boringssl.cmake b/tensorflow/contrib/cmake/external/boringssl.cmake
index dc27eadaca..cca8444e2a 100644
--- a/tensorflow/contrib/cmake/external/boringssl.cmake
+++ b/tensorflow/contrib/cmake/external/boringssl.cmake
@@ -39,8 +39,12 @@ ExternalProject_Add(boringssl
# BUILD_IN_SOURCE 1
INSTALL_COMMAND ""
CMAKE_CACHE_ARGS
+ if(tensorflow_ENABLE_POSITION_INDEPENDENT_CODE)
+ -DCMAKE_POSITION_INDEPENDENT_CODE:BOOL=ON
+ else()
+ -DCMAKE_POSITION_INDEPENDENT_CODE:BOOL=OFF
+ endif()
-DCMAKE_BUILD_TYPE:STRING=Release
-DCMAKE_VERBOSE_MAKEFILE:BOOL=OFF
- -DCMAKE_POSITION_INDEPENDENT_CODE:BOOL=ON
)
diff --git a/tensorflow/contrib/cmake/external/jsoncpp.cmake b/tensorflow/contrib/cmake/external/jsoncpp.cmake
index 5127d7e8f7..d2ae4c76e8 100644
--- a/tensorflow/contrib/cmake/external/jsoncpp.cmake
+++ b/tensorflow/contrib/cmake/external/jsoncpp.cmake
@@ -42,8 +42,12 @@ ExternalProject_Add(jsoncpp
BUILD_IN_SOURCE 1
INSTALL_COMMAND ""
CMAKE_CACHE_ARGS
+ if(tensorflow_ENABLE_POSITION_INDEPENDENT_CODE)
+ -DCMAKE_POSITION_INDEPENDENT_CODE:BOOL=ON
+ else()
+ -DCMAKE_POSITION_INDEPENDENT_CODE:BOOL=OFF
+ endif()
-DCMAKE_BUILD_TYPE:STRING=Release
-DCMAKE_VERBOSE_MAKEFILE:BOOL=OFF
- -DCMAKE_POSITION_INDEPENDENT_CODE:BOOL=ON
)
diff --git a/tensorflow/contrib/cmake/external/lmdb.cmake b/tensorflow/contrib/cmake/external/lmdb.cmake
index 79971b7cfc..e41384f023 100644
--- a/tensorflow/contrib/cmake/external/lmdb.cmake
+++ b/tensorflow/contrib/cmake/external/lmdb.cmake
@@ -29,10 +29,14 @@ ExternalProject_Add(lmdb
INSTALL_DIR ${lmdb_INSTALL}
DOWNLOAD_DIR "${DOWNLOAD_LOCATION}"
CMAKE_CACHE_ARGS
+ if(tensorflow_ENABLE_POSITION_INDEPENDENT_CODE)
+ -DCMAKE_POSITION_INDEPENDENT_CODE:BOOL=ON
+ else()
+ -DCMAKE_POSITION_INDEPENDENT_CODE:BOOL=OFF
+ endif()
-DCMAKE_BUILD_TYPE:STRING=Release
-DCMAKE_VERBOSE_MAKEFILE:BOOL=OFF
-DCMAKE_INSTALL_PREFIX:STRING=${lmdb_INSTALL}
- -DCMAKE_POSITION_INDEPENDENT_CODE:BOOL=ON
)
if(WIN32)
diff --git a/tensorflow/contrib/cmake/external/png.cmake b/tensorflow/contrib/cmake/external/png.cmake
index 2b2bd47d1c..aad6618f52 100644
--- a/tensorflow/contrib/cmake/external/png.cmake
+++ b/tensorflow/contrib/cmake/external/png.cmake
@@ -41,10 +41,14 @@ ExternalProject_Add(png
INSTALL_DIR ${png_INSTALL}
DOWNLOAD_DIR "${DOWNLOAD_LOCATION}"
CMAKE_CACHE_ARGS
+ if(tensorflow_ENABLE_POSITION_INDEPENDENT_CODE)
+ -DCMAKE_POSITION_INDEPENDENT_CODE:BOOL=ON
+ else()
+ -DCMAKE_POSITION_INDEPENDENT_CODE:BOOL=OFF
+ endif()
-DCMAKE_BUILD_TYPE:STRING=Release
-DCMAKE_VERBOSE_MAKEFILE:BOOL=OFF
-DCMAKE_INSTALL_PREFIX:STRING=${png_INSTALL}
- -DCMAKE_POSITION_INDEPENDENT_CODE:BOOL=ON
-DZLIB_ROOT:STRING=${ZLIB_INSTALL}
)
diff --git a/tensorflow/contrib/cmake/external/protobuf.cmake b/tensorflow/contrib/cmake/external/protobuf.cmake
index 1e300e21df..b53857a47b 100644
--- a/tensorflow/contrib/cmake/external/protobuf.cmake
+++ b/tensorflow/contrib/cmake/external/protobuf.cmake
@@ -44,8 +44,12 @@ ExternalProject_Add(protobuf
${PROTOBUF_ADDITIONAL_CMAKE_OPTIONS}
INSTALL_COMMAND ""
CMAKE_CACHE_ARGS
+ if(tensorflow_ENABLE_POSITION_INDEPENDENT_CODE)
+ -DCMAKE_POSITION_INDEPENDENT_CODE:BOOL=ON
+ else()
+ -DCMAKE_POSITION_INDEPENDENT_CODE:BOOL=OFF
+ endif()
-DCMAKE_BUILD_TYPE:STRING=Release
-DCMAKE_VERBOSE_MAKEFILE:BOOL=OFF
- -DCMAKE_POSITION_INDEPENDENT_CODE:BOOL=ON
-DZLIB_ROOT:STRING=${ZLIB_INSTALL}
)
diff --git a/tensorflow/contrib/cmake/external/re2.cmake b/tensorflow/contrib/cmake/external/re2.cmake
index cb4ec9c2de..b56f4b0898 100644
--- a/tensorflow/contrib/cmake/external/re2.cmake
+++ b/tensorflow/contrib/cmake/external/re2.cmake
@@ -38,7 +38,11 @@ ExternalProject_Add(re2
BUILD_IN_SOURCE 1
DOWNLOAD_DIR "${DOWNLOAD_LOCATION}"
CMAKE_CACHE_ARGS
+ if(tensorflow_ENABLE_POSITION_INDEPENDENT_CODE)
+ -DCMAKE_POSITION_INDEPENDENT_CODE:BOOL=ON
+ else()
+ -DCMAKE_POSITION_INDEPENDENT_CODE:BOOL=OFF
+ endif()
-DCMAKE_BUILD_TYPE:STRING=Release
-DCMAKE_INSTALL_PREFIX:STRING=${re2_INSTALL}
- -DCMAKE_POSITION_INDEPENDENT_CODE:BOOL=ON
-) \ No newline at end of file
+)
diff --git a/tensorflow/contrib/cmake/external/snappy.cmake b/tensorflow/contrib/cmake/external/snappy.cmake
index 2d2451521c..926c271fd9 100644
--- a/tensorflow/contrib/cmake/external/snappy.cmake
+++ b/tensorflow/contrib/cmake/external/snappy.cmake
@@ -40,11 +40,15 @@ ExternalProject_Add(snappy
LOG_CONFIGURE ON
LOG_BUILD ON
CMAKE_CACHE_ARGS
+ if(tensorflow_ENABLE_POSITION_INDEPENDENT_CODE)
+ -DCMAKE_POSITION_INDEPENDENT_CODE:BOOL=ON
+ else()
+ -DCMAKE_POSITION_INDEPENDENT_CODE:BOOL=OFF
+ endif()
-DCMAKE_BUILD_TYPE:STRING=Release
-DCMAKE_VERBOSE_MAKEFILE:BOOL=OFF
-DSNAPPY_BUILD_TESTS:BOOL=OFF
- -DCMAKE_POSITION_INDEPENDENT_CODE:BOOL=ON
)
# actually enables snappy in the source code
-add_definitions(-DTF_USE_SNAPPY)
+add_definitions(-DTF_USE_SNAPPY) \ No newline at end of file
diff --git a/tensorflow/contrib/cmake/external/sqlite.cmake b/tensorflow/contrib/cmake/external/sqlite.cmake
index 1770dcb1fd..785039a469 100644
--- a/tensorflow/contrib/cmake/external/sqlite.cmake
+++ b/tensorflow/contrib/cmake/external/sqlite.cmake
@@ -53,9 +53,13 @@ else()
INSTALL_DIR ${sqlite_INSTALL}
DOWNLOAD_DIR "${DOWNLOAD_LOCATION}"
CMAKE_CACHE_ARGS
+ if(tensorflow_ENABLE_POSITION_INDEPENDENT_CODE)
+ -DCMAKE_POSITION_INDEPENDENT_CODE:BOOL=ON
+ else()
+ -DCMAKE_POSITION_INDEPENDENT_CODE:BOOL=OFF
+ endif()
-DCMAKE_BUILD_TYPE:STRING=Release
-DCMAKE_VERBOSE_MAKEFILE:BOOL=OFF
- -DCMAKE_POSITION_INDEPENDENT_CODE:BOOL=ON
-DCMAKE_INSTALL_PREFIX:STRING=${sqlite_INSTALL}
)
diff --git a/tensorflow/contrib/cmake/external/zlib.cmake b/tensorflow/contrib/cmake/external/zlib.cmake
index c8af611e1e..f10f84336e 100644
--- a/tensorflow/contrib/cmake/external/zlib.cmake
+++ b/tensorflow/contrib/cmake/external/zlib.cmake
@@ -42,9 +42,13 @@ ExternalProject_Add(zlib
BUILD_IN_SOURCE 1
DOWNLOAD_DIR "${DOWNLOAD_LOCATION}"
CMAKE_CACHE_ARGS
+ if(tensorflow_ENABLE_POSITION_INDEPENDENT_CODE)
+ -DCMAKE_POSITION_INDEPENDENT_CODE:BOOL=ON
+ else()
+ -DCMAKE_POSITION_INDEPENDENT_CODE:BOOL=OFF
+ endif()
-DCMAKE_BUILD_TYPE:STRING=Release
-DCMAKE_INSTALL_PREFIX:STRING=${ZLIB_INSTALL}
- -DCMAKE_POSITION_INDEPENDENT_CODE:BOOL=ON
)
# put zlib includes in the directory where they are expected
diff --git a/tensorflow/contrib/cmake/tf_c.cmake b/tensorflow/contrib/cmake/tf_c.cmake
index 3ae28b7601..f3882e8cf7 100644
--- a/tensorflow/contrib/cmake/tf_c.cmake
+++ b/tensorflow/contrib/cmake/tf_c.cmake
@@ -21,6 +21,7 @@ set(tf_c_srcs
"${tensorflow_source_dir}/tensorflow/c/c_api_function.cc"
"${tensorflow_source_dir}/tensorflow/c/eager/c_api.cc"
"${tensorflow_source_dir}/tensorflow/c/eager/c_api.h"
+ "${tensorflow_source_dir}/tensorflow/c/eager/tape.cc"
"${tensorflow_source_dir}/tensorflow/c/eager/tape.h"
"${tensorflow_source_dir}/tensorflow/c/eager/runtime.cc"
"${tensorflow_source_dir}/tensorflow/c/eager/runtime.h"
diff --git a/tensorflow/contrib/cmake/tf_cc_ops.cmake b/tensorflow/contrib/cmake/tf_cc_ops.cmake
index a5f5ae5478..f63aca4a83 100644
--- a/tensorflow/contrib/cmake/tf_cc_ops.cmake
+++ b/tensorflow/contrib/cmake/tf_cc_ops.cmake
@@ -148,7 +148,11 @@ list(REMOVE_ITEM tf_cc_srcs ${tf_cc_test_srcs})
add_library(tf_cc OBJECT ${tf_cc_srcs})
add_dependencies(tf_cc tf_cc_framework tf_cc_ops)
-set (pywrap_tensorflow_lib "${CMAKE_CURRENT_BINARY_DIR}/${CMAKE_BUILD_TYPE}/pywrap_tensorflow_internal.lib")
+if (WIN32)
+ set (pywrap_tensorflow_lib "${CMAKE_CURRENT_BINARY_DIR}/${CMAKE_BUILD_TYPE}/pywrap_tensorflow_internal.lib")
+else (WIN32)
+ set (pywrap_tensorflow_lib "${CMAKE_CURRENT_BINARY_DIR}/libpywrap_tensorflow_internal.so")
+endif (WIN32)
add_custom_target(tf_extension_ops)
function(AddUserOps)
@@ -164,15 +168,13 @@ function(AddUserOps)
# create shared library from source and cuda obj
add_library(${_AT_TARGET} SHARED ${_AT_SOURCES} ${gpu_lib})
target_link_libraries(${_AT_TARGET} ${pywrap_tensorflow_lib})
- if(WIN32)
- if (tensorflow_ENABLE_GPU AND _AT_GPUSOURCES)
- # some ops call out to cuda directly; need to link libs for the cuda dlls
- target_link_libraries(${_AT_TARGET} ${CUDA_LIBRARIES})
- endif()
- if (_AT_DISTCOPY)
- add_custom_command(TARGET ${_AT_TARGET} POST_BUILD
- COMMAND ${CMAKE_COMMAND} -E copy $<TARGET_FILE:${_AT_TARGET}> ${_AT_DISTCOPY}/)
- endif()
+ if (tensorflow_ENABLE_GPU AND _AT_GPUSOURCES)
+ # some ops call out to cuda directly; need to link libs for the cuda dlls
+ target_link_libraries(${_AT_TARGET} ${CUDA_LIBRARIES})
+ endif()
+ if (_AT_DISTCOPY)
+ add_custom_command(TARGET ${_AT_TARGET} POST_BUILD
+ COMMAND ${CMAKE_COMMAND} -E copy $<TARGET_FILE:${_AT_TARGET}> ${_AT_DISTCOPY}/)
endif()
if (_AT_DEPENDS)
add_dependencies(${_AT_TARGET} ${_AT_DEPENDS})
@@ -180,9 +182,19 @@ function(AddUserOps)
# make sure TF_COMPILE_LIBRARY is not defined for this target
get_target_property(target_compile_flags ${_AT_TARGET} COMPILE_FLAGS)
if(target_compile_flags STREQUAL "target_compile_flags-NOTFOUND")
- set(target_compile_flags "/UTF_COMPILE_LIBRARY")
+ if (WIN32)
+ set(target_compile_flags "/UTF_COMPILE_LIBRARY")
+ else (WIN32)
+ # gcc already reads source input as UTF-8 by default; make it explicit
+ set(target_compile_flags "-finput-charset=UTF-8")
+ endif (WIN32)
else()
- set(target_compile_flags "${target_compile_flags} /UTF_COMPILE_LIBRARY")
+ if (WIN32)
+ set(target_compile_flags "${target_compile_flags} /UTF_COMPILE_LIBRARY")
+ else (WIN32)
+ # gcc already reads source input as UTF-8 by default; make it explicit
+ set(target_compile_flags "${target_compile_flags} -finput-charset=UTF-8")
+ endif (WIN32)
endif()
set_target_properties(${_AT_TARGET} PROPERTIES COMPILE_FLAGS ${target_compile_flags})
add_dependencies(tf_extension_ops ${_AT_TARGET})
diff --git a/tensorflow/contrib/cmake/tf_core_framework.cmake b/tensorflow/contrib/cmake/tf_core_framework.cmake
index c607546f4a..c3dc8531bb 100644
--- a/tensorflow/contrib/cmake/tf_core_framework.cmake
+++ b/tensorflow/contrib/cmake/tf_core_framework.cmake
@@ -301,8 +301,6 @@ file(GLOB_RECURSE tf_core_framework_srcs
"${tensorflow_source_dir}/tensorflow/core/common_runtime/session.cc"
"${tensorflow_source_dir}/tensorflow/core/common_runtime/session_factory.cc"
"${tensorflow_source_dir}/tensorflow/core/common_runtime/session_options.cc"
- "${tensorflow_source_dir}/tensorflow/contrib/tensorboard/db/*.cc"
- "${tensorflow_source_dir}/tensorflow/contrib/tensorboard/db/*.h"
"${tensorflow_source_dir}/public/*.h"
)
@@ -316,7 +314,6 @@ file(GLOB_RECURSE tf_core_framework_exclude_srcs
"${tensorflow_source_dir}/tensorflow/core/util/*test*.h"
"${tensorflow_source_dir}/tensorflow/core/util/*test*.cc"
"${tensorflow_source_dir}/tensorflow/core/util/*main.cc"
- "${tensorflow_source_dir}/tensorflow/contrib/tensorboard/db/*test*.cc"
)
list(REMOVE_ITEM tf_core_framework_srcs ${tf_core_framework_exclude_srcs})
diff --git a/tensorflow/contrib/cmake/tf_core_kernels.cmake b/tensorflow/contrib/cmake/tf_core_kernels.cmake
index f978c8ccd5..a2ab4b9ae4 100644
--- a/tensorflow/contrib/cmake/tf_core_kernels.cmake
+++ b/tensorflow/contrib/cmake/tf_core_kernels.cmake
@@ -70,6 +70,7 @@ if(tensorflow_BUILD_CONTRIB_KERNELS)
"${tensorflow_source_dir}/tensorflow/contrib/cudnn_rnn/kernels/cudnn_rnn_ops.cc"
"${tensorflow_source_dir}/tensorflow/contrib/cudnn_rnn/ops/cudnn_rnn_ops.cc"
"${tensorflow_source_dir}/tensorflow/contrib/data/kernels/prefetching_kernels.cc"
+ "${tensorflow_source_dir}/tensorflow/contrib/data/ops/dataset_ops.cc"
"${tensorflow_source_dir}/tensorflow/contrib/data/ops/prefetching_ops.cc"
"${tensorflow_source_dir}/tensorflow/contrib/factorization/kernels/clustering_ops.cc"
"${tensorflow_source_dir}/tensorflow/contrib/factorization/kernels/masked_matmul_ops.cc"
@@ -183,6 +184,7 @@ file(GLOB_RECURSE tf_core_gpu_kernels_srcs
"${tensorflow_source_dir}/tensorflow/contrib/image/kernels/*.cu.cc"
"${tensorflow_source_dir}/tensorflow/contrib/rnn/kernels/*.cu.cc"
"${tensorflow_source_dir}/tensorflow/contrib/seq2seq/kernels/*.cu.cc"
+ "${tensorflow_source_dir}/tensorflow/contrib/resampler/kernels/*.cu.cc"
)
if(WIN32 AND tensorflow_ENABLE_GPU)
@@ -206,16 +208,16 @@ endif(WIN32 AND tensorflow_ENABLE_GPU)
add_library(tf_core_kernels OBJECT ${tf_core_kernels_srcs})
add_dependencies(tf_core_kernels tf_core_cpu)
-if(WIN32)
+if (WIN32)
target_compile_options(tf_core_kernels PRIVATE /MP)
- if (tensorflow_ENABLE_GPU)
- set_source_files_properties(${tf_core_gpu_kernels_srcs} PROPERTIES CUDA_SOURCE_PROPERTY_FORMAT OBJ)
- set(tf_core_gpu_kernels_lib tf_core_gpu_kernels)
- cuda_add_library(${tf_core_gpu_kernels_lib} ${tf_core_gpu_kernels_srcs})
- set_target_properties(${tf_core_gpu_kernels_lib}
- PROPERTIES DEBUG_POSTFIX ""
- COMPILE_FLAGS "${TF_REGULAR_CXX_FLAGS}"
- )
- add_dependencies(${tf_core_gpu_kernels_lib} tf_core_cpu)
- endif()
+endif (WIN32)
+if (tensorflow_ENABLE_GPU)
+ set_source_files_properties(${tf_core_gpu_kernels_srcs} PROPERTIES CUDA_SOURCE_PROPERTY_FORMAT OBJ)
+ set(tf_core_gpu_kernels_lib tf_core_gpu_kernels)
+ cuda_add_library(${tf_core_gpu_kernels_lib} ${tf_core_gpu_kernels_srcs})
+ set_target_properties(${tf_core_gpu_kernels_lib}
+ PROPERTIES DEBUG_POSTFIX ""
+ COMPILE_FLAGS "${TF_REGULAR_CXX_FLAGS}"
+ )
+ add_dependencies(${tf_core_gpu_kernels_lib} tf_core_cpu)
endif()
diff --git a/tensorflow/contrib/cmake/tf_core_ops.cmake b/tensorflow/contrib/cmake/tf_core_ops.cmake
index 4a61ed7a35..03c168795c 100644
--- a/tensorflow/contrib/cmake/tf_core_ops.cmake
+++ b/tensorflow/contrib/cmake/tf_core_ops.cmake
@@ -81,6 +81,7 @@ GENERATE_CONTRIB_OP_LIBRARY(boosted_trees_prediction "${tensorflow_source_dir}/t
GENERATE_CONTRIB_OP_LIBRARY(boosted_trees_quantiles "${tensorflow_source_dir}/tensorflow/contrib/boosted_trees/ops/quantile_ops.cc")
GENERATE_CONTRIB_OP_LIBRARY(boosted_trees_stats_accumulator "${tensorflow_source_dir}/tensorflow/contrib/boosted_trees/ops/stats_accumulator_ops.cc")
GENERATE_CONTRIB_OP_LIBRARY(cudnn_rnn "${tensorflow_source_dir}/tensorflow/contrib/cudnn_rnn/ops/cudnn_rnn_ops.cc")
+GENERATE_CONTRIB_OP_LIBRARY(data_dataset "${tensorflow_source_dir}/tensorflow/contrib/data/ops/dataset_ops.cc")
GENERATE_CONTRIB_OP_LIBRARY(data_prefetching "${tensorflow_source_dir}/tensorflow/contrib/data/ops/prefetching_ops.cc")
GENERATE_CONTRIB_OP_LIBRARY(factorization_clustering "${tensorflow_source_dir}/tensorflow/contrib/factorization/ops/clustering_ops.cc")
GENERATE_CONTRIB_OP_LIBRARY(factorization_factorization "${tensorflow_source_dir}/tensorflow/contrib/factorization/ops/factorization_ops.cc")
diff --git a/tensorflow/contrib/cmake/tf_label_image_example.cmake b/tensorflow/contrib/cmake/tf_label_image_example.cmake
index 0d3a4699eb..7f2f60b089 100644
--- a/tensorflow/contrib/cmake/tf_label_image_example.cmake
+++ b/tensorflow/contrib/cmake/tf_label_image_example.cmake
@@ -34,3 +34,8 @@ target_link_libraries(tf_label_image_example PUBLIC
${tf_core_gpu_kernels_lib}
${tensorflow_EXTERNAL_LIBRARIES}
)
+
+install(TARGETS tf_label_image_example
+ RUNTIME DESTINATION bin
+ LIBRARY DESTINATION lib
+ ARCHIVE DESTINATION lib) \ No newline at end of file
diff --git a/tensorflow/contrib/cmake/tf_python.cmake b/tensorflow/contrib/cmake/tf_python.cmake
index 7636e9ba6e..43b98659e3 100755
--- a/tensorflow/contrib/cmake/tf_python.cmake
+++ b/tensorflow/contrib/cmake/tf_python.cmake
@@ -701,6 +701,9 @@ function(GENERATE_PYTHON_OP_LIB tf_python_op_lib_name)
set(require_shape_fn 1)
endif()
+ get_filename_component(GENERATE_PYTHON_OP_LIB_MKDIRPATH ${GENERATE_PYTHON_OP_LIB_DESTINATION} PATH)
+ file(MAKE_DIRECTORY ${GENERATE_PYTHON_OP_LIB_MKDIRPATH})
+
# Create a C++ executable that links in the appropriate op
# registrations and generates Python wrapper code based on the
# registered ops.
@@ -729,6 +732,7 @@ function(GENERATE_PYTHON_OP_LIB tf_python_op_lib_name)
${GENERATE_PYTHON_OP_LIB_DESTINATION} PARENT_SCOPE)
endfunction()
+GENERATE_PYTHON_OP_LIB("audio_ops")
GENERATE_PYTHON_OP_LIB("array_ops")
GENERATE_PYTHON_OP_LIB("bitwise_ops")
GENERATE_PYTHON_OP_LIB("math_ops")
@@ -776,6 +780,8 @@ GENERATE_PYTHON_OP_LIB("contrib_boosted_trees_stats_accumulator_ops"
DESTINATION ${CMAKE_CURRENT_BINARY_DIR}/tf_python/tensorflow/contrib/boosted_trees/python/ops/gen_stats_accumulator_ops.py)
GENERATE_PYTHON_OP_LIB("contrib_cudnn_rnn_ops"
DESTINATION ${CMAKE_CURRENT_BINARY_DIR}/tf_python/tensorflow/contrib/cudnn_rnn/ops/gen_cudnn_rnn_ops.py)
+GENERATE_PYTHON_OP_LIB("contrib_data_dataset_ops"
+ DESTINATION ${CMAKE_CURRENT_BINARY_DIR}/tf_python/tensorflow/contrib/data/python/ops/gen_dataset_ops.py)
GENERATE_PYTHON_OP_LIB("contrib_data_prefetching_ops"
DESTINATION ${CMAKE_CURRENT_BINARY_DIR}/tf_python/tensorflow/contrib/data/python/ops/gen_prefetching_ops.py)
GENERATE_PYTHON_OP_LIB("contrib_factorization_clustering_ops"
@@ -973,7 +979,7 @@ add_library(pywrap_tensorflow_internal SHARED
$<TARGET_OBJECTS:tf_tools_transform_graph_lib>
$<$<BOOL:${tensorflow_ENABLE_GRPC_SUPPORT}>:$<TARGET_OBJECTS:tf_core_distributed_runtime>>
$<TARGET_OBJECTS:tf_core_kernels>
- $<$<BOOL:${tensorflow_ENABLE_GPU}>:$<TARGET_OBJECTS:tf_core_kernels_cpu_only>>
+ $<$<BOOL:${tensorflow_ENABLE_GPU}>:$<$<BOOL:${BOOL_WIN32}>:$<TARGET_OBJECTS:tf_core_kernels_cpu_only>>>
$<$<BOOL:${tensorflow_ENABLE_GPU}>:$<TARGET_OBJECTS:tf_stream_executor>>
${pywrap_tensorflow_deffile}
)
@@ -1049,25 +1055,23 @@ if(WIN32)
DISTCOPY ${CMAKE_CURRENT_BINARY_DIR}/tf_python/tensorflow/contrib/rnn/python/ops/)
endif(WIN32)
-if(WIN32)
- # include contrib/seq2seq as .so
- #
- set(tf_beam_search_srcs
- "${tensorflow_source_dir}/tensorflow/contrib/seq2seq/kernels/beam_search_ops.cc"
- "${tensorflow_source_dir}/tensorflow/contrib/seq2seq/kernels/beam_search_ops.h"
- "${tensorflow_source_dir}/tensorflow/contrib/seq2seq/ops/beam_search_ops.cc"
- )
+# include contrib/seq2seq as .so
+#
+set(tf_beam_search_srcs
+ "${tensorflow_source_dir}/tensorflow/contrib/seq2seq/kernels/beam_search_ops.cc"
+ "${tensorflow_source_dir}/tensorflow/contrib/seq2seq/kernels/beam_search_ops.h"
+ "${tensorflow_source_dir}/tensorflow/contrib/seq2seq/ops/beam_search_ops.cc"
+)
- set(tf_beam_search_gpu_srcs
- "${tensorflow_source_dir}/tensorflow/contrib/seq2seq/kernels/beam_search_ops_gpu.cu.cc"
- )
+set(tf_beam_search_gpu_srcs
+ "${tensorflow_source_dir}/tensorflow/contrib/seq2seq/kernels/beam_search_ops_gpu.cu.cc"
+)
- AddUserOps(TARGET _beam_search_ops
- SOURCES "${tf_beam_search_srcs}"
- GPUSOURCES ${tf_beam_search_gpu_srcs}
- DEPENDS pywrap_tensorflow_internal tf_python_ops
- DISTCOPY ${CMAKE_CURRENT_BINARY_DIR}/tf_python/tensorflow/contrib/seq2seq/python/ops/)
-endif(WIN32)
+AddUserOps(TARGET _beam_search_ops
+ SOURCES "${tf_beam_search_srcs}"
+ GPUSOURCES ${tf_beam_search_gpu_srcs}
+ DEPENDS pywrap_tensorflow_internal tf_python_ops
+ DISTCOPY ${CMAKE_CURRENT_BINARY_DIR}/tf_python/tensorflow/contrib/seq2seq/python/ops/)
############################################################
# Build a PIP package containing the TensorFlow runtime.
diff --git a/tensorflow/contrib/cmake/tf_shared_lib.cmake b/tensorflow/contrib/cmake/tf_shared_lib.cmake
index 9bf45bab30..3e3fe0cdfa 100644
--- a/tensorflow/contrib/cmake/tf_shared_lib.cmake
+++ b/tensorflow/contrib/cmake/tf_shared_lib.cmake
@@ -73,7 +73,7 @@ add_library(tensorflow SHARED
$<TARGET_OBJECTS:tf_tools_transform_graph_lib>
$<$<BOOL:${tensorflow_ENABLE_GRPC_SUPPORT}>:$<TARGET_OBJECTS:tf_core_distributed_runtime>>
$<TARGET_OBJECTS:tf_core_kernels>
- $<$<BOOL:${tensorflow_ENABLE_GPU}>:$<TARGET_OBJECTS:tf_core_kernels_cpu_only>>
+ $<$<BOOL:${tensorflow_ENABLE_GPU}>:$<$<BOOL:${BOOL_WIN32}>:$<TARGET_OBJECTS:tf_core_kernels_cpu_only>>>
$<$<BOOL:${tensorflow_ENABLE_GPU}>:$<TARGET_OBJECTS:tf_stream_executor>>
${tensorflow_deffile}
)
@@ -94,3 +94,46 @@ endif()
if(WIN32)
add_dependencies(tensorflow tensorflow_static)
endif(WIN32)
+
+install(TARGETS tensorflow
+ RUNTIME DESTINATION bin
+ LIBRARY DESTINATION lib
+ ARCHIVE DESTINATION lib)
+
+# install necessary headers
+# tensorflow headers
+install(DIRECTORY ${tensorflow_source_dir}/tensorflow/cc/
+ DESTINATION include/tensorflow/cc
+ FILES_MATCHING PATTERN "*.h")
+install(DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}/tensorflow/cc/
+ DESTINATION include/tensorflow/cc
+ FILES_MATCHING PATTERN "*.h")
+install(DIRECTORY ${tensorflow_source_dir}/tensorflow/core/
+ DESTINATION include/tensorflow/core
+ FILES_MATCHING PATTERN "*.h")
+install(DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}/tensorflow/core/
+ DESTINATION include/tensorflow/core
+ FILES_MATCHING PATTERN "*.h")
+install(DIRECTORY ${tensorflow_source_dir}/tensorflow/stream_executor/
+ DESTINATION include/tensorflow/stream_executor
+ FILES_MATCHING PATTERN "*.h")
+# google protobuf headers
+install(DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}/protobuf/src/protobuf/src/google/
+ DESTINATION include/google
+ FILES_MATCHING PATTERN "*.h")
+# nsync headers
+install(DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}/external/nsync/
+ DESTINATION include/external/nsync
+ FILES_MATCHING PATTERN "*.h")
+# Eigen directory
+install(DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}/eigen/src/eigen/Eigen/
+ DESTINATION include/Eigen)
+# external directory
+install(DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}/external/eigen_archive/
+ DESTINATION include/external/eigen_archive)
+# third_party eigen directory
+install(DIRECTORY ${tensorflow_source_dir}/third_party/eigen3/
+ DESTINATION include/third_party/eigen3)
+# unsupported Eigen directory
+install(DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}/eigen/src/eigen/unsupported/Eigen/
+ DESTINATION include/unsupported/Eigen)
diff --git a/tensorflow/contrib/cmake/tf_stream_executor.cmake b/tensorflow/contrib/cmake/tf_stream_executor.cmake
index 3d84f1ebb9..8d95f0d3e8 100644
--- a/tensorflow/contrib/cmake/tf_stream_executor.cmake
+++ b/tensorflow/contrib/cmake/tf_stream_executor.cmake
@@ -74,6 +74,9 @@ endif()
#)
#list(REMOVE_ITEM tf_stream_executor_srcs ${tf_stream_executor_test_srcs})
+if (NOT WIN32)
+ set (CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -lgomp")
+endif (NOT WIN32)
add_library(tf_stream_executor OBJECT ${tf_stream_executor_srcs})
add_dependencies(tf_stream_executor
diff --git a/tensorflow/contrib/cmake/tf_tools.cmake b/tensorflow/contrib/cmake/tf_tools.cmake
index 6ef9598963..cb58a2e7df 100644
--- a/tensorflow/contrib/cmake/tf_tools.cmake
+++ b/tensorflow/contrib/cmake/tf_tools.cmake
@@ -73,7 +73,7 @@ add_executable(${transform_graph}
$<TARGET_OBJECTS:tf_core_direct_session>
$<TARGET_OBJECTS:tf_tools_transform_graph_lib>
$<TARGET_OBJECTS:tf_core_kernels>
- $<$<BOOL:${tensorflow_ENABLE_GPU}>:$<TARGET_OBJECTS:tf_core_kernels_cpu_only>>
+ $<$<BOOL:${tensorflow_ENABLE_GPU}>:$<$<BOOL:${BOOL_WIN32}>:$<TARGET_OBJECTS:tf_core_kernels_cpu_only>>>
$<$<BOOL:${tensorflow_ENABLE_GPU}>:$<TARGET_OBJECTS:tf_stream_executor>>
)
@@ -95,7 +95,7 @@ add_executable(${summarize_graph}
$<TARGET_OBJECTS:tf_core_direct_session>
$<TARGET_OBJECTS:tf_tools_transform_graph_lib>
$<TARGET_OBJECTS:tf_core_kernels>
- $<$<BOOL:${tensorflow_ENABLE_GPU}>:$<TARGET_OBJECTS:tf_core_kernels_cpu_only>>
+ $<$<BOOL:${tensorflow_ENABLE_GPU}>:$<$<BOOL:${BOOL_WIN32}>:$<TARGET_OBJECTS:tf_core_kernels_cpu_only>>>
$<$<BOOL:${tensorflow_ENABLE_GPU}>:$<TARGET_OBJECTS:tf_stream_executor>>
)
@@ -117,7 +117,7 @@ add_executable(${compare_graphs}
$<TARGET_OBJECTS:tf_core_direct_session>
$<TARGET_OBJECTS:tf_tools_transform_graph_lib>
$<TARGET_OBJECTS:tf_core_kernels>
- $<$<BOOL:${tensorflow_ENABLE_GPU}>:$<TARGET_OBJECTS:tf_core_kernels_cpu_only>>
+ $<$<BOOL:${tensorflow_ENABLE_GPU}>:$<$<BOOL:${BOOL_WIN32}>:$<TARGET_OBJECTS:tf_core_kernels_cpu_only>>>
$<$<BOOL:${tensorflow_ENABLE_GPU}>:$<TARGET_OBJECTS:tf_stream_executor>>
)
@@ -138,7 +138,7 @@ add_executable(${benchmark_model}
$<TARGET_OBJECTS:tf_core_ops>
$<TARGET_OBJECTS:tf_core_direct_session>
$<TARGET_OBJECTS:tf_core_kernels>
- $<$<BOOL:${tensorflow_ENABLE_GPU}>:$<TARGET_OBJECTS:tf_core_kernels_cpu_only>>
+ $<$<BOOL:${tensorflow_ENABLE_GPU}>:$<$<BOOL:${BOOL_WIN32}>:$<TARGET_OBJECTS:tf_core_kernels_cpu_only>>>
$<$<BOOL:${tensorflow_ENABLE_GPU}>:$<TARGET_OBJECTS:tf_stream_executor>>
)
@@ -147,3 +147,8 @@ target_link_libraries(${benchmark_model} PUBLIC
${tf_core_gpu_kernels_lib}
${tensorflow_EXTERNAL_LIBRARIES}
)
+
+install(TARGETS ${transform_graph} ${summarize_graph} ${compare_graphs} ${benchmark_model}
+ RUNTIME DESTINATION bin
+ LIBRARY DESTINATION lib
+ ARCHIVE DESTINATION lib)
diff --git a/tensorflow/contrib/cmake/tf_tutorials.cmake b/tensorflow/contrib/cmake/tf_tutorials.cmake
index 858e7dda92..e63fccc181 100644
--- a/tensorflow/contrib/cmake/tf_tutorials.cmake
+++ b/tensorflow/contrib/cmake/tf_tutorials.cmake
@@ -34,3 +34,8 @@ target_link_libraries(tf_tutorials_example_trainer PUBLIC
${tf_core_gpu_kernels_lib}
${tensorflow_EXTERNAL_LIBRARIES}
)
+
+install(TARGETS tf_tutorials_example_trainer
+ RUNTIME DESTINATION bin
+ LIBRARY DESTINATION lib
+ ARCHIVE DESTINATION lib)
diff --git a/tensorflow/contrib/crf/python/ops/crf.py b/tensorflow/contrib/crf/python/ops/crf.py
index 7166e38b28..c8adb0369b 100644
--- a/tensorflow/contrib/crf/python/ops/crf.py
+++ b/tensorflow/contrib/crf/python/ops/crf.py
@@ -360,8 +360,8 @@ class CrfDecodeForwardRnnCell(rnn_cell.RNNCell):
scope: Unused variable scope of this cell.
Returns:
- backpointers: [batch_size, num_tags], containing backpointers.
- new_state: [batch_size, num_tags], containing new score values.
+ backpointers: A [batch_size, num_tags] matrix of backpointers.
+ new_state: A [batch_size, num_tags] matrix of new score values.
"""
# For simplicity, in shape comments, denote:
# 'batch_size' by 'B', 'max_seq_len' by 'T' , 'num_tags' by 'O' (output).
@@ -385,7 +385,7 @@ class CrfDecodeBackwardRnnCell(rnn_cell.RNNCell):
"""Initialize the CrfDecodeBackwardRnnCell.
Args:
- num_tags
+ num_tags: An integer.
"""
self._num_tags = num_tags
@@ -401,8 +401,9 @@ class CrfDecodeBackwardRnnCell(rnn_cell.RNNCell):
"""Build the CrfDecodeBackwardRnnCell.
Args:
- inputs: [batch_size, num_tags], backpointer of next step (in time order).
- state: [batch_size, 1], next position's tag index.
+ inputs: A [batch_size, num_tags] matrix of
+ backpointers from the next step (in time order).
+ state: A [batch_size, 1] matrix holding the tag index of the next step.
scope: Unused variable scope of this cell.
Returns:
@@ -426,16 +427,16 @@ def crf_decode(potentials, transition_params, sequence_length):
This is a function that operates on tensors.
Args:
- potentials: A [batch_size, max_seq_len, num_tags] tensor, matrix of
+ potentials: A [batch_size, max_seq_len, num_tags] tensor of
unary potentials.
- transition_params: A [num_tags, num_tags] tensor, matrix of
+ transition_params: A [num_tags, num_tags] matrix of
binary potentials.
- sequence_length: A [batch_size] tensor, containing sequence lengths.
+ sequence_length: A [batch_size] vector of true sequence lengths.
Returns:
- decode_tags: A [batch_size, max_seq_len] tensor, with dtype tf.int32.
+ decode_tags: A [batch_size, max_seq_len] matrix, with dtype `tf.int32`.
Contains the highest scoring tag indices.
- best_score: A [batch_size] tensor, containing the score of decode_tags.
+ best_score: A [batch_size] vector, containing the score of `decode_tags`.
"""
# For simplicity, in shape comments, denote:
# 'batch_size' by 'B', 'max_seq_len' by 'T' , 'num_tags' by 'O' (output).
diff --git a/tensorflow/contrib/data/BUILD b/tensorflow/contrib/data/BUILD
index eaede0e00e..7bcf5a5f4d 100644
--- a/tensorflow/contrib/data/BUILD
+++ b/tensorflow/contrib/data/BUILD
@@ -35,8 +35,19 @@ tf_custom_op_library(
],
)
+# TODO(mrry): Move the kernels out of the core library into this library.
+tf_custom_op_library(
+ name = "_dataset_ops.so",
+ srcs = [
+ "ops/dataset_ops.cc",
+ ],
+)
+
tf_gen_op_libs(
- op_lib_names = ["prefetching_ops"],
+ op_lib_names = [
+ "dataset_ops",
+ "prefetching_ops",
+ ],
)
filegroup(
diff --git a/tensorflow/contrib/data/__init__.py b/tensorflow/contrib/data/__init__.py
index 6e43ae0e63..0c7e793689 100644
--- a/tensorflow/contrib/data/__init__.py
+++ b/tensorflow/contrib/data/__init__.py
@@ -23,7 +23,6 @@ See the @{$datasets$Importing Data} Programmer's Guide for an overview.
@@TextLineDataset
@@batch_and_drop_remainder
-@@padded_batch_and_drop_remainder
@@dense_to_sparse_batch
@@enumerate_dataset
@@group_by_window
@@ -42,11 +41,10 @@ from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
-# pylint: disable=unused-import
+# pylint: disable=unused-import
from tensorflow.contrib.data.python.ops.batching import batch_and_drop_remainder
from tensorflow.contrib.data.python.ops.batching import dense_to_sparse_batch
-from tensorflow.contrib.data.python.ops.batching import padded_batch_and_drop_remainder
from tensorflow.contrib.data.python.ops.batching import unbatch
from tensorflow.contrib.data.python.ops.dataset_ops import Dataset
from tensorflow.contrib.data.python.ops.dataset_ops import get_single_element
diff --git a/tensorflow/contrib/data/ops/dataset_ops.cc b/tensorflow/contrib/data/ops/dataset_ops.cc
new file mode 100644
index 0000000000..1574384cb2
--- /dev/null
+++ b/tensorflow/contrib/data/ops/dataset_ops.cc
@@ -0,0 +1,232 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#include "tensorflow/core/framework/common_shape_fns.h"
+#include "tensorflow/core/framework/op.h"
+#include "tensorflow/core/framework/op_def_builder.h"
+#include "tensorflow/core/framework/shape_inference.h"
+
+namespace tensorflow {
+
+// --------------------------------------------------------------------------
+
+// The ops in this section can be composed to define an input
+// pipeline. Each op produces a DT_VARIANT tensor that represents
+// a DAG of "dataset" objects. A "dataset" object can be converted
+// to a stateful "iterator" by passing the "dataset" to the
+// "MakeIterator" op.
+//
+// TODO(b/65524810): DT_VARIANT tensors that represent "dataset" objects are
+// not presently serializable. To avoid issues with constant folding, ensure
+// that any "source dataset" ops (i.e. ops that output a dataset and do not
+// take one as input) are marked "stateful".
+
+REGISTER_OP("IgnoreErrorsDataset")
+ .Input("input_dataset: variant")
+ .Output("handle: variant")
+ .Attr("output_types: list(type) >= 1")
+ .Attr("output_shapes: list(shape) >= 1")
+ .SetShapeFn(shape_inference::ScalarShape)
+ .Doc(R"doc(
+Creates a dataset that contains the elements of `input_dataset` ignoring errors.
+)doc");
+
+REGISTER_OP("MapAndBatchDataset")
+ .Input("input_dataset: variant")
+ .Input("other_arguments: Targuments")
+ .Input("batch_size: int64")
+ .Input("num_parallel_batches: int64")
+ .Output("handle: variant")
+ .Attr("f: func")
+ .Attr("Targuments: list(type) >= 0")
+ .Attr("output_types: list(type) >= 1")
+ .Attr("output_shapes: list(shape) >= 1")
+ .SetShapeFn(shape_inference::ScalarShape)
+ .Doc(R"doc(
+Creates a dataset that applies `f` to the outputs of `input_dataset` and then
+batches `batch_size` of them.
+
+Unlike a "MapDataset", which applies `f` sequentially, this dataset invokes up
+to `batch_size * num_parallel_batches` copies of `f` in parallel.
+
+batch_size: A scalar representing the number of elements to accumulate in a
+ batch. It determines the number of concurrent invocations of `f` that process
+ elements from `input_dataset` in parallel.
+num_parallel_batches: A scalar representing the number of batches to create in
+ parallel. Processing multiple batches in parallel benefits workloads prone to
+ stragglers.
+)doc");
+
+REGISTER_OP("ScanDataset")
+ .Input("input_dataset: variant")
+ .Input("initial_state: Tstate")
+ .Input("other_arguments: Targuments")
+ .Output("handle: variant")
+ .Attr("f: func")
+ .Attr("Tstate: list(type) >= 1")
+ .Attr("Targuments: list(type) >= 0")
+ .Attr("output_types: list(type) >= 1")
+ .Attr("output_shapes: list(shape) >= 1")
+ .SetShapeFn(shape_inference::ScalarShape)
+ .Doc(R"doc(
+Creates a dataset that successively reduces `f` over the elements of `input_dataset`.
+)doc");
+
+REGISTER_OP("ParallelInterleaveDataset")
+ .Input("input_dataset: variant")
+ .Input("other_arguments: Targuments")
+ .Input("cycle_length: int64")
+ .Input("block_length: int64")
+ .Input("sloppy: bool")
+ .Output("handle: variant")
+ .Attr("f: func")
+ .Attr("Targuments: list(type) >= 0")
+ .Attr("output_types: list(type) >= 1")
+ .Attr("output_shapes: list(shape) >= 1")
+ .SetShapeFn(shape_inference::ScalarShape)
+ .Doc(R"doc(
+Creates a dataset that applies `f` to the outputs of `input_dataset`.
+
+The resulting dataset is similar to the `InterleaveDataset`, with the exception
+that if retrieving the next value from a dataset would cause the requester to
+block, it will skip that input dataset. This dataset is especially useful
+when loading data from variable-latency datastores (e.g. HDFS, GCS), as it
+allows the training step to proceed so long as some data is available.
+
+!! WARNING !! This dataset is not deterministic!
+
+f: A function mapping elements of `input_dataset`, concatenated with
+ `other_arguments`, to a Dataset variant that contains elements matching
+ `output_types` and `output_shapes`.
+)doc");
+
+REGISTER_OP("GroupByWindowDataset")
+ .Input("input_dataset: variant")
+ .Input("key_func_other_arguments: Tkey_func_other_arguments")
+ .Input("reduce_func_other_arguments: Treduce_func_other_arguments")
+ .Input(
+ "window_size_func_other_arguments: Twindow_size_func_other_arguments")
+ .Output("handle: variant")
+ .Attr("key_func: func")
+ .Attr("reduce_func: func")
+ .Attr("window_size_func: func")
+ .Attr("Tkey_func_other_arguments: list(type) >= 0")
+ .Attr("Treduce_func_other_arguments: list(type) >= 0")
+ .Attr("Twindow_size_func_other_arguments: list(type) >= 0")
+ .Attr("output_types: list(type) >= 1")
+ .Attr("output_shapes: list(shape) >= 1")
+ .SetShapeFn(shape_inference::ScalarShape)
+ .Doc(R"doc(
+Creates a dataset that computes a windowed group-by on `input_dataset`.
+
+// TODO(mrry): Support non-int64 keys.
+
+key_func: A function mapping an element of `input_dataset`, concatenated
+ with `key_func_other_arguments` to a scalar value of type DT_INT64.
+)doc");
+
+REGISTER_OP("DenseToSparseBatchDataset")
+ .Input("input_dataset: variant")
+ .Input("batch_size: int64")
+ .Input("row_shape: int64")
+ .Output("handle: variant")
+ // NOTE(mrry): the 0th and 2nd elements will be DT_INT64.
+ .Attr("output_types: list(type) >= 1")
+ // NOTE(mrry): the 1st and 2nd elements will be vectors.
+ .Attr("output_shapes: list(shape) >= 1")
+ .SetShapeFn(shape_inference::ScalarShape)
+ .Doc(R"doc(
+Creates a dataset that yields a SparseTensor for each element of the input.
+
+input_dataset: A handle to an input dataset. Must have a single component.
+batch_size: A scalar representing the number of elements to accumulate in a
+ batch.
+row_shape: A vector representing the dense shape of each row in the produced
+ SparseTensor. The shape may be partially specified, using `-1` to indicate
+ that a particular dimension should use the maximum size of all batch elements.
+)doc");
+
+REGISTER_OP("SqlDataset")
+ .Input("driver_name: string")
+ .Input("data_source_name: string")
+ .Input("query: string")
+ .Output("handle: variant")
+ .Attr("output_types: list(type) >= 1")
+ .Attr("output_shapes: list(shape) >= 1")
+ .SetIsStateful() // TODO(b/65524810): Source dataset ops must be marked
+ // stateful to inhibit constant folding.
+ .SetShapeFn(shape_inference::ScalarShape)
+ .Doc(R"doc(
+Creates a dataset that executes a SQL query and emits rows of the result set.
+
+driver_name: The database type. Currently, the only supported type is 'sqlite'.
+data_source_name: A connection string to connect to the database.
+query: A SQL query to execute.
+)doc");
+
+REGISTER_OP("DatasetToSingleElement")
+ .Input("dataset: variant")
+ .Output("components: output_types")
+ .Attr("output_types: list(type) >= 1")
+ .Attr("output_shapes: list(shape) >= 1")
+ .SetShapeFn([](shape_inference::InferenceContext* c) {
+ shape_inference::ShapeHandle unused;
+ TF_RETURN_IF_ERROR(c->WithRank(c->input(0), 0, &unused));
+ std::vector<PartialTensorShape> output_shapes;
+ TF_RETURN_IF_ERROR(c->GetAttr("output_shapes", &output_shapes));
+ if (output_shapes.size() != c->num_outputs()) {
+ return errors::InvalidArgument(
+ "`output_shapes` must be the same length as `output_types` (",
+ output_shapes.size(), " vs. ", c->num_outputs(), ")");
+ }
+ for (size_t i = 0; i < output_shapes.size(); ++i) {
+ shape_inference::ShapeHandle output_shape_handle;
+ TF_RETURN_IF_ERROR(c->MakeShapeFromPartialTensorShape(
+ output_shapes[i], &output_shape_handle));
+ c->set_output(static_cast<int>(i), output_shape_handle);
+ }
+ return Status::OK();
+ })
+ .Doc(R"doc(
+Outputs the single element from the given dataset.
+
+dataset: A handle to a dataset that contains a single element.
+components: The components of the single element of `input`.
+)doc");
+
+REGISTER_OP("SerializeIterator")
+ .Input("resource_handle: resource")
+ .Output("serialized: variant")
+ .SetShapeFn(shape_inference::ScalarShape)
+ .Doc(R"doc(
+Converts the given `resource_handle` representing an iterator to a variant tensor.
+
+resource_handle: A handle to an iterator resource.
+serialized: A variant tensor storing the state of the iterator contained in the
+ resource.
+)doc");
+
+REGISTER_OP("DeserializeIterator")
+ .Input("resource_handle: resource")
+ .Input("serialized: variant")
+ .SetShapeFn(shape_inference::NoOutputs)
+ .Doc(R"doc(
+Converts the given variant tensor to an iterator and stores it in the given resource.
+
+resource_handle: A handle to an iterator resource.
+serialized: A variant tensor storing the state of the iterator contained in the
+ resource.
+)doc");
+
+} // namespace tensorflow
diff --git a/tensorflow/contrib/data/python/kernel_tests/BUILD b/tensorflow/contrib/data/python/kernel_tests/BUILD
index 241fc2ab4f..5877f42dcf 100644
--- a/tensorflow/contrib/data/python/kernel_tests/BUILD
+++ b/tensorflow/contrib/data/python/kernel_tests/BUILD
@@ -365,9 +365,7 @@ py_test(
size = "small",
srcs = ["sequence_dataset_op_test.py"],
srcs_version = "PY2AND3",
- tags = ["no_pip"],
deps = [
- ":dataset_serialization_test",
"//tensorflow/contrib/data/python/ops:dataset_ops",
"//tensorflow/python:array_ops",
"//tensorflow/python:client_testlib",
@@ -430,9 +428,7 @@ py_test(
size = "small",
srcs = ["zip_dataset_op_test.py"],
srcs_version = "PY2AND3",
- tags = ["no_pip"],
deps = [
- ":dataset_serialization_test",
"//tensorflow/contrib/data/python/ops:dataset_ops",
"//tensorflow/contrib/data/python/ops:iterator_ops",
"//tensorflow/python:array_ops",
@@ -451,7 +447,10 @@ py_test(
size = "small",
srcs = ["prefetching_ops_test.py"],
srcs_version = "PY2AND3",
- tags = ["no_oss"], # b/68785503
+ tags = [
+ "manual",
+ "no_oss",
+ ],
deps = [
"//tensorflow/contrib/data/python/ops:dataset_ops",
"//tensorflow/contrib/data/python/ops:prefetching_py",
diff --git a/tensorflow/contrib/data/python/kernel_tests/batch_dataset_op_test.py b/tensorflow/contrib/data/python/kernel_tests/batch_dataset_op_test.py
index 951d4bb5f7..670f622c3c 100644
--- a/tensorflow/contrib/data/python/kernel_tests/batch_dataset_op_test.py
+++ b/tensorflow/contrib/data/python/kernel_tests/batch_dataset_op_test.py
@@ -52,9 +52,8 @@ class BatchDatasetTest(test.TestCase):
def _map_fn(x, y, z):
return math_ops.square(x), math_ops.square(y), math_ops.square(z)
- iterator = (
- dataset_ops.Dataset.from_tensor_slices(components).map(_map_fn)
- .repeat(count).batch(batch_size).make_initializable_iterator())
+ iterator = (dataset_ops.Dataset.from_tensor_slices(components).map(_map_fn)
+ .repeat(count).batch(batch_size).make_initializable_iterator())
init_op = iterator.initializer
get_next = iterator.get_next()
@@ -70,7 +69,7 @@ class BatchDatasetTest(test.TestCase):
result = sess.run(get_next)
for component, result_component in zip(components, result):
for j in range(14):
- self.assertAllEqual(component[(i * 14 + j) % 7]**2,
+ self.assertAllEqual(component[(i*14 + j) % 7]**2,
result_component[j])
with self.assertRaises(errors.OutOfRangeError):
sess.run(get_next)
@@ -85,12 +84,12 @@ class BatchDatasetTest(test.TestCase):
result = sess.run(get_next)
for component, result_component in zip(components, result):
for j in range(8):
- self.assertAllEqual(component[(i * 8 + j) % 7]**2,
+ self.assertAllEqual(component[(i*8 + j) % 7]**2,
result_component[j])
result = sess.run(get_next)
for component, result_component in zip(components, result):
for j in range((14 * 7) % 8):
- self.assertAllEqual(component[((num_batches - 1) * 8 + j) % 7]**2,
+ self.assertAllEqual(component[((num_batches - 1)*8 + j) % 7]**2,
result_component[j])
with self.assertRaises(errors.OutOfRangeError):
sess.run(get_next)
@@ -108,10 +107,10 @@ class BatchDatasetTest(test.TestCase):
seq_lens = array_ops.placeholder(dtypes.int32, shape=[None])
padded_shape = array_ops.placeholder(dtypes.int64, shape=[1])
- iterator = (
- dataset_ops.Dataset.from_tensor_slices(seq_lens)
- .map(lambda x: array_ops.fill([x], x)).padded_batch(
- 4, padded_shapes=padded_shape).make_initializable_iterator())
+ iterator = (dataset_ops.Dataset.from_tensor_slices(seq_lens)
+ .map(lambda x: array_ops.fill([x], x)).padded_batch(
+ 4,
+ padded_shapes=padded_shape).make_initializable_iterator())
init_op = iterator.initializer
get_next = iterator.get_next()
@@ -119,40 +118,35 @@ class BatchDatasetTest(test.TestCase):
with self.test_session() as sess:
# Test with random sequence lengths, and max padding.
random_seq_lens = np.random.randint(20, size=(32,)).astype(np.int32)
- sess.run(
- init_op, feed_dict={
- padded_shape: [-1],
- seq_lens: random_seq_lens
- })
+ sess.run(init_op, feed_dict={padded_shape: [-1],
+ seq_lens: random_seq_lens})
for i in range(8):
result = sess.run(get_next)
padded_len = np.max(result)
self.assertEqual((4, padded_len), result.shape)
for j in range(4):
- seq_len = random_seq_lens[(i * 4) + j]
+ seq_len = random_seq_lens[(i*4)+j]
self.assertAllEqual(result[j, :seq_len], [seq_len] * seq_len)
self.assertAllEqual(result[j, seq_len:], [0] * (padded_len - seq_len))
with self.assertRaises(errors.OutOfRangeError):
sess.run(get_next)
# Test with random sequence lengths, and constant padding.
- sess.run(
- init_op, feed_dict={
- padded_shape: [25],
- seq_lens: random_seq_lens
- })
+ sess.run(init_op, feed_dict={padded_shape: [25],
+ seq_lens: random_seq_lens})
for i in range(8):
result = sess.run(get_next)
self.assertEqual((4, 25), result.shape)
for j in range(4):
- seq_len = random_seq_lens[(i * 4) + j]
+ seq_len = random_seq_lens[(i*4)+j]
self.assertAllEqual(result[j, :seq_len], [seq_len] * seq_len)
self.assertAllEqual(result[j, seq_len:], [0] * (25 - seq_len))
with self.assertRaises(errors.OutOfRangeError):
sess.run(get_next)
# Test correct handling of empty tensors.
- sess.run(init_op, feed_dict={padded_shape: [-1], seq_lens: [0, 0, 0, 0]})
+ sess.run(init_op, feed_dict={padded_shape: [-1],
+ seq_lens: [0, 0, 0, 0]})
result = sess.run(get_next)
self.assertAllEqual([[], [], [], []], result)
with self.assertRaises(errors.OutOfRangeError):
@@ -160,7 +154,8 @@ class BatchDatasetTest(test.TestCase):
# Test error handling with constant sequence lengths, and
# too-short padding.
- sess.run(init_op, feed_dict={padded_shape: [5], seq_lens: [6, 5, 5, 5]})
+ sess.run(init_op, feed_dict={padded_shape: [5],
+ seq_lens: [6, 5, 5, 5]})
with self.assertRaises(errors.DataLossError):
result = sess.run(get_next)
@@ -171,13 +166,11 @@ class BatchDatasetTest(test.TestCase):
def fill_tuple(x):
filled = array_ops.fill([x], x)
return (filled, string_ops.as_string(filled))
-
- iterator = (
- dataset_ops.Dataset.from_tensor_slices(seq_lens).map(fill_tuple)
- .padded_batch(
- 4,
- padded_shapes=(padded_shape, padded_shape),
- padding_values=(-1, "<end>")).make_initializable_iterator())
+ iterator = (dataset_ops.Dataset.from_tensor_slices(seq_lens).map(fill_tuple)
+ .padded_batch(
+ 4,
+ padded_shapes=(padded_shape, padded_shape),
+ padding_values=(-1, "<end>")).make_initializable_iterator())
init_op = iterator.initializer
get_next = iterator.get_next()
@@ -185,18 +178,15 @@ class BatchDatasetTest(test.TestCase):
with self.test_session() as sess:
# Test with random sequence lengths, and max padding.
random_seq_lens = np.random.randint(20, size=(32,)).astype(np.int32)
- sess.run(
- init_op, feed_dict={
- padded_shape: [-1],
- seq_lens: random_seq_lens
- })
+ sess.run(init_op, feed_dict={padded_shape: [-1],
+ seq_lens: random_seq_lens})
for i in range(8):
result = sess.run(get_next)
padded_len = np.max(result[0])
self.assertEqual((4, padded_len), result[0].shape)
self.assertEqual((4, padded_len), result[1].shape)
for j in range(4):
- seq_len = random_seq_lens[(i * 4) + j]
+ seq_len = random_seq_lens[(i*4)+j]
self.assertAllEqual(result[0][j, :seq_len], [seq_len] * seq_len)
self.assertAllEqual(result[0][j, seq_len:],
[-1] * (padded_len - seq_len))
@@ -230,21 +220,20 @@ class BatchDatasetTest(test.TestCase):
constant_op.constant([-1, -1], dtype=dtypes.int64),
constant_op.constant([37], dtype=dtypes.int64)))
- for dataset in [
- dynamic_padding_from_tensor_shapes, dynamic_padding_from_lists,
- dynamic_padding_from_lists_with_minus_one, dynamic_padding_from_tensors
- ]:
+ for dataset in [dynamic_padding_from_tensor_shapes,
+ dynamic_padding_from_lists,
+ dynamic_padding_from_lists_with_minus_one,
+ dynamic_padding_from_tensors]:
self.assertEqual([None, None], dataset.output_shapes[0].as_list())
self.assertEqual([None, None, None], dataset.output_shapes[1].as_list())
self.assertEqual([None, 37], dataset.output_shapes[2].as_list())
def testDenseToSparseBatchDataset(self):
components = np.random.randint(12, size=(100,)).astype(np.int32)
- iterator = (
- dataset_ops.Dataset.from_tensor_slices(components)
- .map(lambda x: array_ops.fill([x], x)).apply(
- batching.dense_to_sparse_batch(4,
- [12])).make_initializable_iterator())
+ iterator = (dataset_ops.Dataset.from_tensor_slices(components)
+ .map(lambda x: array_ops.fill([x], x)).apply(
+ batching.dense_to_sparse_batch(4, [12]))
+ .make_initializable_iterator())
init_op = iterator.initializer
get_next = sparse_tensor.SparseTensor(*iterator.get_next())
@@ -253,26 +242,24 @@ class BatchDatasetTest(test.TestCase):
for start in range(0, len(components), 4):
results = sess.run(get_next)
- self.assertAllEqual([[i, j]
- for i, c in enumerate(components[start:start + 4])
- for j in range(c)], results.indices)
self.assertAllEqual(
- [c for c in components[start:start + 4] for _ in range(c)],
+ [[i, j] for i, c in enumerate(components[start:start+4])
+ for j in range(c)], results.indices)
+ self.assertAllEqual(
+ [c for c in components[start:start+4] for _ in range(c)],
results.values)
- self.assertAllEqual([min(4,
- len(components) - start), 12],
- results.dense_shape)
+ self.assertAllEqual(
+ [min(4, len(components) - start), 12], results.dense_shape)
with self.assertRaises(errors.OutOfRangeError):
sess.run(get_next)
def testDenseToSparseBatchDatasetWithUnknownShape(self):
components = np.random.randint(5, size=(40,)).astype(np.int32)
- iterator = (
- dataset_ops.Dataset.from_tensor_slices(components)
- .map(lambda x: array_ops.fill([x, x], x)).apply(
- batching.dense_to_sparse_batch(
- 4, [5, -1])).make_initializable_iterator())
+ iterator = (dataset_ops.Dataset.from_tensor_slices(components)
+ .map(lambda x: array_ops.fill([x, x], x)).apply(
+ batching.dense_to_sparse_batch(
+ 4, [5, -1])).make_initializable_iterator())
init_op = iterator.initializer
get_next = sparse_tensor.SparseTensor(*iterator.get_next())
@@ -281,30 +268,27 @@ class BatchDatasetTest(test.TestCase):
for start in range(0, len(components), 4):
results = sess.run(get_next)
- self.assertAllEqual([[i, j, z]
- for i, c in enumerate(components[start:start + 4])
- for j in range(c)
- for z in range(c)], results.indices)
- self.assertAllEqual([
- c
- for c in components[start:start + 4] for _ in range(c)
- for _ in range(c)
- ], results.values)
- self.assertAllEqual([
- min(4,
- len(components) - start), 5,
- np.max(components[start:start + 4])
- ], results.dense_shape)
+ self.assertAllEqual(
+ [[i, j, z] for i, c in enumerate(components[start:start+4])
+ for j in range(c) for z in range(c)], results.indices)
+ self.assertAllEqual(
+ [c for c in components[start:start+4]
+ for _ in range(c) for _ in range(c)],
+ results.values)
+ self.assertAllEqual(
+ [min(4, len(components) - start),
+ 5,
+ np.max(components[start:start+4])],
+ results.dense_shape)
with self.assertRaises(errors.OutOfRangeError):
sess.run(get_next)
def testDenseToSparseBatchDatasetWithInvalidShape(self):
input_tensor = array_ops.constant([[1]])
- iterator = (
- dataset_ops.Dataset.from_tensors(input_tensor).apply(
- batching.dense_to_sparse_batch(4, [-2]))
- .make_initializable_iterator())
+ iterator = (dataset_ops.Dataset.from_tensors(input_tensor)
+ .apply(batching.dense_to_sparse_batch(4, [-2]))
+ .make_initializable_iterator())
init_op = iterator.initializer
with self.test_session() as sess:
@@ -314,10 +298,8 @@ class BatchDatasetTest(test.TestCase):
def testDenseToSparseBatchDatasetShapeErrors(self):
input_tensor = array_ops.placeholder(dtypes.int32)
- iterator = (
- dataset_ops.Dataset.from_tensors(input_tensor).apply(
- batching.dense_to_sparse_batch(4,
- [12])).make_initializable_iterator())
+ iterator = (dataset_ops.Dataset.from_tensors(input_tensor).apply(
+ batching.dense_to_sparse_batch(4, [12])).make_initializable_iterator())
init_op = iterator.initializer
get_next = sparse_tensor.SparseTensor(*iterator.get_next())
@@ -374,7 +356,8 @@ class BatchDatasetTest(test.TestCase):
def testUnbatchMultiElementTupleDataset(self):
data = tuple([(math_ops.range(10 * i, 10 * i + 10),
- array_ops.fill([10], "hi")) for i in range(3)])
+ array_ops.fill([10], "hi"))
+ for i in range(3)])
data = dataset_ops.Dataset.from_tensor_slices(data)
expected_types = ((dtypes.int32, dtypes.string),) * 3
data = data.batch(2)
@@ -387,7 +370,9 @@ class BatchDatasetTest(test.TestCase):
with self.test_session() as sess:
for i in range(10):
- self.assertEqual(((i, b"hi"), (10 + i, b"hi"), (20 + i, b"hi")),
+ self.assertEqual(((i, b"hi"),
+ (10 + i, b"hi"),
+ (20 + i, b"hi")),
sess.run(op))
with self.assertRaises(errors.OutOfRangeError):
@@ -400,10 +385,9 @@ class BatchDatasetTest(test.TestCase):
batch_size = array_ops.placeholder(dtypes.int64, shape=[])
- iterator = (
- dataset_ops.Dataset.from_tensor_slices(components).apply(
- batching.batch_and_drop_remainder(batch_size))
- .make_initializable_iterator())
+ iterator = (dataset_ops.Dataset.from_tensor_slices(components).apply(
+ batching.batch_and_drop_remainder(batch_size))
+ .make_initializable_iterator())
next_element = iterator.get_next()
@@ -420,51 +404,14 @@ class BatchDatasetTest(test.TestCase):
with self.assertRaises(errors.OutOfRangeError):
sess.run(next_element)
- def testPaddedBatchAndDropRemainder(self):
- els = []
- for length in [3, 6, 9, 4, 12, 10, 2]:
- els.append((np.array(length), np.arange(length) + 1,
- np.array(length * 2)))
-
- dataset = dataset_ops.Dataset.from_tensors(els[0])
- for el in els[1:]:
- dataset = dataset.concatenate(dataset_ops.Dataset.from_tensors(el))
-
- batch_size = array_ops.placeholder(dtypes.int64, shape=[])
- iterator = (
- dataset.apply(
- batching.padded_batch_and_drop_remainder(
- batch_size, ([], [None], []))).make_initializable_iterator())
-
- next_element = iterator.get_next()
-
- with self.test_session() as sess:
- for test_batch_size in [1, 3, 7, 10]:
- sess.run(iterator.initializer, feed_dict={batch_size: test_batch_size})
- num_batches = 7 // test_batch_size
- for i in range(num_batches):
- result = sess.run(next_element)
- for component_idx, result_component in enumerate(result):
- for j in range(test_batch_size):
- data_idx = i * test_batch_size + j
- comp = result_component[j]
- unpadded = comp[comp > 0]
- if np.isscalar(comp):
- # The boolean mask indexing above adds a dim back. Rm it.
- unpadded = unpadded[0]
- self.assertAllEqual(els[data_idx][component_idx], unpadded)
- with self.assertRaises(errors.OutOfRangeError):
- sess.run(next_element)
-
def testBatchAndDropRemainderShapeInference(self):
- components = (array_ops.placeholder(dtypes.int32),
- (array_ops.placeholder(dtypes.int32, shape=[None]),
- array_ops.placeholder(dtypes.int32, shape=[20, 30])))
+ components = (array_ops.placeholder(dtypes.int32), (array_ops.placeholder(
+ dtypes.int32, shape=[None]), array_ops.placeholder(
+ dtypes.int32, shape=[20, 30])))
# Test with a statically known batch size.
- dataset = (
- dataset_ops.Dataset.from_tensor_slices(components).apply(
- batching.batch_and_drop_remainder(128)))
+ dataset = (dataset_ops.Dataset.from_tensor_slices(components).apply(
+ batching.batch_and_drop_remainder(128)))
self.assertIs(None, dataset.output_shapes[0].ndims)
self.assertEqual([128], dataset.output_shapes[1][0].as_list())
@@ -473,9 +420,8 @@ class BatchDatasetTest(test.TestCase):
# Test with a dynamic batch size: the static shape will be unknown, because
# `batch_size` is a placeholder.
batch_size = array_ops.placeholder(dtypes.int64)
- dataset = (
- dataset_ops.Dataset.from_tensor_slices(components).apply(
- batching.batch_and_drop_remainder(batch_size)))
+ dataset = (dataset_ops.Dataset.from_tensor_slices(components).apply(
+ batching.batch_and_drop_remainder(batch_size)))
self.assertIs(None, dataset.output_shapes[0].ndims)
self.assertEqual([None], dataset.output_shapes[1][0].as_list())
@@ -495,10 +441,9 @@ class BatchDatasetTest(test.TestCase):
def _map_fn(x, y, z):
return math_ops.square(x), math_ops.square(y), math_ops.square(z)
- iterator = (
- dataset_ops.Dataset.from_tensor_slices(components).repeat(count).apply(
- batching.map_and_batch(_map_fn, batch_size))
- .make_initializable_iterator())
+ iterator = (dataset_ops.Dataset.from_tensor_slices(components).repeat(count)
+ .apply(batching.map_and_batch(_map_fn, batch_size))
+ .make_initializable_iterator())
init_op = iterator.initializer
get_next = iterator.get_next()
@@ -514,7 +459,7 @@ class BatchDatasetTest(test.TestCase):
result = sess.run(get_next)
for component, result_component in zip(components, result):
for j in range(14):
- self.assertAllEqual(component[(i * 14 + j) % 7]**2,
+ self.assertAllEqual(component[(i*14 + j) % 7]**2,
result_component[j])
with self.assertRaises(errors.OutOfRangeError):
sess.run(get_next)
@@ -529,7 +474,7 @@ class BatchDatasetTest(test.TestCase):
result = sess.run(get_next)
for component, result_component in zip(components, result):
for j in range(8):
- self.assertAllEqual(component[(i * 8 + j) % 7]**2,
+ self.assertAllEqual(component[(i*8 + j) % 7]**2,
result_component[j])
# The last batch should fail with `OutOfRange`.
with self.assertRaises(errors.OutOfRangeError):
@@ -550,9 +495,8 @@ class BatchDatasetTest(test.TestCase):
array_ops.check_numerics(
constant_op.constant(1.0) / constant_op.constant(0.0), "oops"))
batch_size = array_ops.placeholder(dtypes.int64, shape=[])
- iterator = (
- dataset.apply(batching.map_and_batch(lambda x: x, batch_size))
- .make_initializable_iterator())
+ iterator = (dataset.apply(batching.map_and_batch(lambda x: x, batch_size))
+ .make_initializable_iterator())
init_op = iterator.initializer
with self.test_session() as sess:
with self.assertRaisesRegexp(errors.InvalidArgumentError, "oops"):
@@ -560,7 +504,6 @@ class BatchDatasetTest(test.TestCase):
def testBatchAndMapDatasetShapeMismatch(self):
"""Test a dataset that maps a TF function across its input elements."""
-
def generator():
yield [1]
yield [2]
diff --git a/tensorflow/contrib/data/python/kernel_tests/dataset_serialization_test_base.py b/tensorflow/contrib/data/python/kernel_tests/dataset_serialization_test_base.py
index 07fecf04fa..df9147af6c 100644
--- a/tensorflow/contrib/data/python/kernel_tests/dataset_serialization_test_base.py
+++ b/tensorflow/contrib/data/python/kernel_tests/dataset_serialization_test_base.py
@@ -32,7 +32,7 @@ from tensorflow.python.util import nest
class DatasetSerializationTestBase(test.TestCase):
- """Base class for testing serializable datasets."""
+ """Base class for testing finite serializable datasets."""
def tearDown(self):
self._delete_ckpt()
@@ -58,19 +58,17 @@ class DatasetSerializationTestBase(test.TestCase):
if ds_fn2:
self.verify_restore_in_modified_graph(ds_fn1, ds_fn2, num_outputs)
- def verify_unused_iterator(self, ds_fn, num_outputs, verify_exhausted=True):
+ def verify_unused_iterator(self, ds_fn, num_outputs):
"""Verifies that saving and restoring an unused iterator works.
Args:
ds_fn: See `run_core_tests`.
num_outputs: See `run_core_tests`.
- verify_exhausted: See `gen_outputs`.
Raises:
AssertionError if any test fails.
"""
- self.verify_run_with_breaks(
- ds_fn, [0], num_outputs, verify_exhausted=verify_exhausted)
+ self.verify_run_with_breaks(ds_fn, [0], num_outputs)
def verify_fully_used_iterator(self, ds_fn, num_outputs):
"""Verifies that saving and restoring a fully used iterator works.
@@ -106,16 +104,12 @@ class DatasetSerializationTestBase(test.TestCase):
ds_fn, [], 0, ckpt_saved=True, verify_exhausted=True)
self.assertEqual(len(actual), 0)
- def verify_init_before_restore(self,
- ds_fn,
- num_outputs,
- verify_exhausted=True):
+ def verify_init_before_restore(self, ds_fn, num_outputs):
"""Verifies that retoring into an already initilized iterator works.
Args:
ds_fn: See `run_core_tests`.
num_outputs: See `run_core_tests`.
- verify_exhausted: See `gen_outputs`.
Raises:
AssertionError if any test fails.
@@ -124,14 +118,9 @@ class DatasetSerializationTestBase(test.TestCase):
ds_fn,
self.gen_break_points(num_outputs),
num_outputs,
- init_before_restore=True,
- verify_exhausted=verify_exhausted)
+ init_before_restore=True)
- def verify_multiple_breaks(self,
- ds_fn,
- num_outputs,
- num_breaks=10,
- verify_exhausted=True):
+ def verify_multiple_breaks(self, ds_fn, num_outputs, num_breaks=10):
"""Attempts to save/restore at multiple break points.
Args:
@@ -139,22 +128,16 @@ class DatasetSerializationTestBase(test.TestCase):
num_outputs: See `run_core_tests`.
num_breaks: The number of break points. These are uniformly spread in
[0, num_outputs] both inclusive.
- verify_exhausted: See `gen_outputs`.
Raises:
AssertionError if any test fails.
"""
- self.verify_run_with_breaks(
- ds_fn,
- self.gen_break_points(num_outputs),
- num_outputs,
- verify_exhausted=verify_exhausted)
+ self.verify_run_with_breaks(ds_fn,
+ self.gen_break_points(num_outputs, num_breaks),
+ num_outputs)
- def verify_reset_restored_iterator(self,
- ds_fn,
- num_outputs,
- break_point=None,
- verify_exhausted=True):
+ def verify_reset_restored_iterator(self, ds_fn, num_outputs,
+ break_point=None):
"""Attempts to re-initialize a restored iterator.
This is useful when restoring a training checkpoint during validation.
@@ -163,7 +146,6 @@ class DatasetSerializationTestBase(test.TestCase):
ds_fn: See `run_core_tests`.
num_outputs: See `run_core_tests`.
break_point: Break point. Optional. Defaults to num_outputs/2.
- verify_exhausted: See `gen_outputs`.
Raises:
AssertionError if any test fails.
@@ -171,8 +153,7 @@ class DatasetSerializationTestBase(test.TestCase):
break_point = num_outputs // 2 if not break_point else break_point
# Collect ground truth containing all outputs.
- expected = self.gen_outputs(
- ds_fn, [], num_outputs, verify_exhausted=verify_exhausted)
+ expected = self.gen_outputs(ds_fn, [], num_outputs, verify_exhausted=True)
# Skip some items and save checkpoint.
self.gen_outputs(ds_fn, [], break_point, verify_exhausted=False)
@@ -187,17 +168,15 @@ class DatasetSerializationTestBase(test.TestCase):
sess.run(init_op)
for _ in range(num_outputs):
actual.append(sess.run(get_next_op))
- if verify_exhausted:
- with self.assertRaises(errors.OutOfRangeError):
- sess.run(get_next_op)
+ with self.assertRaises(errors.OutOfRangeError):
+ sess.run(get_next_op)
self.match(expected, actual)
def verify_restore_in_modified_graph(self,
ds_fn1,
ds_fn2,
num_outputs,
- break_point=None,
- verify_exhausted=True):
+ break_point=None):
"""Attempts to restore an iterator in a modified graph.
Builds an input pipeline using ds_fn1, runs it for `break_point` steps
@@ -209,7 +188,6 @@ class DatasetSerializationTestBase(test.TestCase):
ds_fn2: See `run_core_tests`.
num_outputs: See `run_core_tests`.
break_point: Break point. Optional. Defaults to num_outputs/2.
- verify_exhausted: See `gen_outputs`.
Raises:
AssertionError if any test fails.
@@ -218,15 +196,15 @@ class DatasetSerializationTestBase(test.TestCase):
# Skip `break_point` items and store the remaining produced from ds_fn1
# in `expected`.
- self.gen_outputs(ds_fn1, [], break_point, verify_exhausted=False)
+ self.gen_outputs(ds_fn1, [], break_point)
expected = self.gen_outputs(
ds_fn1, [],
num_outputs - break_point,
ckpt_saved=True,
- verify_exhausted=verify_exhausted)
+ verify_exhausted=True)
# Generate `break_point` items from ds_fn1 and save checkpoint.
- self.gen_outputs(ds_fn1, [], break_point, verify_exhausted=False)
+ self.gen_outputs(ds_fn1, [], break_point)
actual = []
# Build graph for ds_fn2 but load checkpoint for ds_fn1.
@@ -236,9 +214,8 @@ class DatasetSerializationTestBase(test.TestCase):
self._restore(saver, sess)
for _ in range(num_outputs - break_point):
actual.append(sess.run(get_next_op))
- if verify_exhausted:
- with self.assertRaises(errors.OutOfRangeError):
- sess.run(get_next_op)
+ with self.assertRaises(errors.OutOfRangeError):
+ sess.run(get_next_op)
self.match(expected, actual)
@@ -246,7 +223,6 @@ class DatasetSerializationTestBase(test.TestCase):
ds_fn,
break_points,
num_outputs,
- verify_exhausted=True,
init_before_restore=False):
"""Verifies that ds_fn() produces the same outputs with and without breaks.
@@ -261,7 +237,6 @@ class DatasetSerializationTestBase(test.TestCase):
ds_fn: See `gen_outputs`.
break_points: See `gen_outputs`.
num_outputs: See `gen_outputs`.
- verify_exhausted: See `gen_outputs`.
init_before_restore: See `gen_outputs`.
Raises:
@@ -270,13 +245,13 @@ class DatasetSerializationTestBase(test.TestCase):
expected = self.gen_outputs(
ds_fn, [],
num_outputs,
- verify_exhausted=verify_exhausted,
+ verify_exhausted=True,
init_before_restore=init_before_restore)
actual = self.gen_outputs(
ds_fn,
break_points,
num_outputs,
- verify_exhausted=verify_exhausted,
+ verify_exhausted=True,
init_before_restore=init_before_restore)
self.match(expected, actual)
@@ -286,7 +261,7 @@ class DatasetSerializationTestBase(test.TestCase):
num_outputs,
ckpt_saved=False,
init_before_restore=False,
- verify_exhausted=True):
+ verify_exhausted=False):
"""Generates elements from input dataset while stopping at break points.
Produces `num_outputs` outputs and saves the state of the iterator in the
@@ -310,7 +285,7 @@ class DatasetSerializationTestBase(test.TestCase):
after producing `num_outputs` elements.
Returns:
- A list of `num_outputs` items.
+ A list of `num_outputs` items.
"""
outputs = []
@@ -337,11 +312,11 @@ class DatasetSerializationTestBase(test.TestCase):
num_iters = end - start
for _ in range(num_iters):
outputs.append(sess.run(get_next_op))
+ self._save(sess, saver)
+ ckpt_saved = True
if i == len(break_points) and verify_exhausted:
with self.assertRaises(errors.OutOfRangeError):
sess.run(get_next_op)
- self._save(sess, saver)
- ckpt_saved = True
return outputs
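
The save/restore pattern these helpers exercise (checkpoint an iterator mid-stream, then resume from that checkpoint) reduces to a small standalone sketch. This assumes TF 1.x graph mode and the contrib `make_saveable_from_iterator` helper as it exists on this branch; the checkpoint path and element counts are illustrative:

    # Minimal sketch of checkpointing dataset-iterator state (illustrative values).
    import tensorflow as tf
    from tensorflow.contrib.data.python.ops import iterator_ops as contrib_iterator_ops

    dataset = tf.data.Dataset.range(10)
    iterator = dataset.make_initializable_iterator()
    get_next = iterator.get_next()
    # Expose the iterator's state to tf.train.Saver via the saveable-objects collection.
    saveable = contrib_iterator_ops.make_saveable_from_iterator(iterator)
    tf.add_to_collection(tf.GraphKeys.SAVEABLE_OBJECTS, saveable)
    saver = tf.train.Saver()

    with tf.Session() as sess:
      sess.run(iterator.initializer)
      print([sess.run(get_next) for _ in range(4)])  # [0, 1, 2, 3]
      saver.save(sess, "/tmp/iterator_ckpt")         # the "break point"
    with tf.Session() as sess:
      saver.restore(sess, "/tmp/iterator_ckpt")      # no re-initialization needed
      print([sess.run(get_next) for _ in range(6)])  # [4, 5, 6, 7, 8, 9]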
diff --git a/tensorflow/contrib/data/python/kernel_tests/iterator_ops_test.py b/tensorflow/contrib/data/python/kernel_tests/iterator_ops_test.py
index bda9a2a4a3..271d80a54b 100644
--- a/tensorflow/contrib/data/python/kernel_tests/iterator_ops_test.py
+++ b/tensorflow/contrib/data/python/kernel_tests/iterator_ops_test.py
@@ -21,6 +21,7 @@ import os
import numpy as np
from tensorflow.contrib.data.python.ops import dataset_ops
+from tensorflow.contrib.data.python.ops import gen_dataset_ops
from tensorflow.contrib.data.python.ops import readers
from tensorflow.core.protobuf import config_pb2
from tensorflow.python.client import session
@@ -33,7 +34,6 @@ from tensorflow.python.framework import ops
from tensorflow.python.framework import test_util
from tensorflow.python.ops import array_ops
from tensorflow.python.ops import functional_ops
-from tensorflow.python.ops import gen_dataset_ops
from tensorflow.python.ops import gradients_impl
from tensorflow.python.ops import io_ops
from tensorflow.python.ops import math_ops
diff --git a/tensorflow/contrib/data/python/kernel_tests/range_dataset_op_test.py b/tensorflow/contrib/data/python/kernel_tests/range_dataset_op_test.py
index f59ac760dc..329dc80ba5 100644
--- a/tensorflow/contrib/data/python/kernel_tests/range_dataset_op_test.py
+++ b/tensorflow/contrib/data/python/kernel_tests/range_dataset_op_test.py
@@ -21,6 +21,7 @@ import os
from tensorflow.contrib.data.python.ops import dataset_ops
from tensorflow.contrib.data.python.ops import enumerate_ops
+from tensorflow.contrib.data.python.ops import gen_dataset_ops
from tensorflow.contrib.data.python.ops import iterator_ops as contrib_iterator_ops
from tensorflow.python.data.ops import iterator_ops
from tensorflow.python.framework import constant_op
@@ -29,7 +30,6 @@ from tensorflow.python.framework import errors
from tensorflow.python.framework import ops
from tensorflow.python.framework import tensor_shape
from tensorflow.python.ops import array_ops
-from tensorflow.python.ops import gen_dataset_ops
from tensorflow.python.ops import io_ops
from tensorflow.python.ops import parsing_ops
from tensorflow.python.ops import variables
diff --git a/tensorflow/contrib/data/python/kernel_tests/reader_dataset_ops_test.py b/tensorflow/contrib/data/python/kernel_tests/reader_dataset_ops_test.py
index 3ae8f71d77..8033f1d388 100644
--- a/tensorflow/contrib/data/python/kernel_tests/reader_dataset_ops_test.py
+++ b/tensorflow/contrib/data/python/kernel_tests/reader_dataset_ops_test.py
@@ -21,6 +21,7 @@ import gzip
import os
import zlib
+from tensorflow.contrib.data.python.ops import gen_dataset_ops
from tensorflow.contrib.data.python.ops import iterator_ops as contrib_iterator_ops
from tensorflow.contrib.data.python.ops import readers
from tensorflow.core.example import example_pb2
@@ -33,7 +34,6 @@ from tensorflow.python.framework import ops
from tensorflow.python.framework import tensor_shape
from tensorflow.python.lib.io import python_io
from tensorflow.python.ops import array_ops
-from tensorflow.python.ops import gen_dataset_ops
from tensorflow.python.ops import io_ops
from tensorflow.python.ops import parsing_ops
from tensorflow.python.platform import test
diff --git a/tensorflow/contrib/data/python/kernel_tests/sequence_dataset_op_test.py b/tensorflow/contrib/data/python/kernel_tests/sequence_dataset_op_test.py
index 1a26da82e5..91615e9f62 100644
--- a/tensorflow/contrib/data/python/kernel_tests/sequence_dataset_op_test.py
+++ b/tensorflow/contrib/data/python/kernel_tests/sequence_dataset_op_test.py
@@ -19,7 +19,6 @@ from __future__ import print_function
import numpy as np
-from tensorflow.contrib.data.python.kernel_tests import dataset_serialization_test_base
from tensorflow.contrib.data.python.ops import dataset_ops
from tensorflow.python.framework import dtypes
from tensorflow.python.framework import errors
@@ -208,82 +207,5 @@ class SequenceDatasetTest(test.TestCase):
sess.run(get_next)
-class SequenceDatasetSerializationTest(
- dataset_serialization_test_base.DatasetSerializationTestBase):
-
- def _build_skip_dataset(self, count):
- components = (np.arange(10),)
- return dataset_ops.Dataset.from_tensor_slices(components).skip(count)
-
- def testSkipFewerThanInputs(self):
- count = 4
- num_outputs = 10 - count
- self.run_core_tests(lambda: self._build_skip_dataset(count),
- lambda: self._build_skip_dataset(count + 2),
- num_outputs)
-
- def testSkipVarious(self):
- # Skip more than inputs
- self.run_core_tests(lambda: self._build_skip_dataset(20), None, 0)
- # Skip exactly the input size
- self.run_core_tests(lambda: self._build_skip_dataset(10), None, 0)
- self.run_core_tests(lambda: self._build_skip_dataset(-1), None, 0)
- # Skip nothing
- self.run_core_tests(lambda: self._build_skip_dataset(0), None, 10)
-
- def _build_take_dataset(self, count):
- components = (np.arange(10),)
- return dataset_ops.Dataset.from_tensor_slices(components).take(count)
-
- def testTakeFewerThanInputs(self):
- count = 4
- self.run_core_tests(
- lambda: self._build_take_dataset(count),
- lambda: self._build_take_dataset(count + 2),
- count,
- )
-
- def testTakeVarious(self):
- # Take more than inputs
- self.run_core_tests(lambda: self._build_take_dataset(20), None, 10)
- # Take exactly the input size
- self.run_core_tests(lambda: self._build_take_dataset(10), None, 10)
- # Take all
- self.run_core_tests(lambda: self._build_take_dataset(-1), None, 10)
- # Take nothing
- self.run_core_tests(lambda: self._build_take_dataset(0), None, 0)
-
- def _build_repeat_dataset(self, count, take_count=3):
- components = (np.arange(10),)
- return dataset_ops.Dataset.from_tensor_slices(components).take(
- take_count).repeat(count)
-
- def testFiniteRepeat(self):
- count = 10
- self.run_core_tests(lambda: self._build_repeat_dataset(count),
- lambda: self._build_repeat_dataset(count + 2),
- 3 * count)
-
- def testEmptyRepeat(self):
- self.run_core_tests(lambda: self._build_repeat_dataset(0), None, 0)
-
- def testInfiniteRepeat(self):
- self.verify_unused_iterator(
- lambda: self._build_repeat_dataset(-1), 10, verify_exhausted=False)
- self.verify_init_before_restore(
- lambda: self._build_repeat_dataset(-1), 10, verify_exhausted=False)
- self.verify_multiple_breaks(
- lambda: self._build_repeat_dataset(-1), 20, verify_exhausted=False)
- self.verify_reset_restored_iterator(
- lambda: self._build_repeat_dataset(-1), 20, verify_exhausted=False)
- self.verify_restore_in_modified_graph(
- lambda: self._build_repeat_dataset(-1),
- lambda: self._build_repeat_dataset(2),
- 20,
- verify_exhausted=False)
- # Test repeat empty dataset
- self.run_core_tests(lambda: self._build_repeat_dataset(-1, 0), None, 0)
-
-
if __name__ == "__main__":
test.main()
diff --git a/tensorflow/contrib/data/python/kernel_tests/zip_dataset_op_test.py b/tensorflow/contrib/data/python/kernel_tests/zip_dataset_op_test.py
index 5d34b0024c..b0e7218301 100644
--- a/tensorflow/contrib/data/python/kernel_tests/zip_dataset_op_test.py
+++ b/tensorflow/contrib/data/python/kernel_tests/zip_dataset_op_test.py
@@ -19,7 +19,6 @@ from __future__ import print_function
import numpy as np
-from tensorflow.contrib.data.python.kernel_tests import dataset_serialization_test_base
from tensorflow.contrib.data.python.ops import dataset_ops
from tensorflow.python.framework import dtypes
from tensorflow.python.framework import errors
@@ -111,31 +110,5 @@ class ZipDatasetTest(test.TestCase):
sess.run(get_next)
-class ZipDatasetSerializationTest(
- dataset_serialization_test_base.DatasetSerializationTestBase):
-
- def _build_dataset(self, arr):
- components = [
- np.tile(np.array([[1], [2], [3], [4]]), 20),
- np.tile(np.array([[12], [13], [14], [15]]), 22),
- np.array(arr)
- ]
- datasets = [
- dataset_ops.Dataset.from_tensor_slices(component)
- for component in components
- ]
- return dataset_ops.Dataset.zip((datasets[0], (datasets[1], datasets[2])))
-
- def testCore(self):
- # Equal length components
- arr = [37.0, 38.0, 39.0, 40.0]
- num_outputs = len(arr)
- self.run_core_tests(lambda: self._build_dataset(arr), None, num_outputs)
- # Variable length components
- diff_size_arr = [1.0, 2.0]
- self.run_core_tests(lambda: self._build_dataset(diff_size_arr),
- lambda: self._build_dataset(arr), 2)
-
-
if __name__ == "__main__":
test.main()
diff --git a/tensorflow/contrib/data/python/ops/BUILD b/tensorflow/contrib/data/python/ops/BUILD
index 1b81cf5be9..727c5d1c38 100644
--- a/tensorflow/contrib/data/python/ops/BUILD
+++ b/tensorflow/contrib/data/python/ops/BUILD
@@ -12,20 +12,6 @@ load(
load("//tensorflow:tensorflow.bzl", "tf_custom_op_py_library")
py_library(
- name = "dataset_ops",
- srcs = [
- "dataset_ops.py",
- ],
- srcs_version = "PY2AND3",
- deps = [
- ":transformation_ops",
- "//tensorflow/python:util",
- "//tensorflow/python/data/ops:dataset_ops",
- "//tensorflow/python/data/util:nest",
- ],
-)
-
-py_library(
name = "iterator_ops",
srcs = [
"iterator_ops.py",
@@ -73,6 +59,7 @@ py_library(
],
srcs_version = "PY2AND3",
deps = [
+ ":gen_dataset_ops",
"//tensorflow/python:array_ops",
"//tensorflow/python:control_flow_ops",
"//tensorflow/python:dataset_ops_gen",
@@ -128,6 +115,31 @@ tf_custom_op_py_library(
],
)
+tf_gen_op_wrapper_py(
+ name = "gen_dataset_ops",
+ out = "gen_dataset_ops.py",
+ deps = ["//tensorflow/contrib/data:dataset_ops_op_lib"],
+)
+
+tf_custom_op_py_library(
+ name = "dataset_ops",
+ srcs = ["dataset_ops.py"],
+ dso = ["//tensorflow/contrib/data:_dataset_ops.so"],
+ kernels = [
+ "//tensorflow/contrib/data:dataset_ops_op_lib",
+ ],
+ srcs_version = "PY2AND3",
+ deps = [
+ ":gen_dataset_ops",
+ ":transformation_ops",
+ "//tensorflow/contrib/util:util_py",
+ "//tensorflow/python:platform",
+ "//tensorflow/python:util",
+ "//tensorflow/python/data/ops:dataset_ops",
+ "//tensorflow/python/data/util:nest",
+ ],
+)
+
filegroup(
name = "all_files",
srcs = glob(
diff --git a/tensorflow/contrib/data/python/ops/batching.py b/tensorflow/contrib/data/python/ops/batching.py
index d4ade7adfd..e6e5f716b6 100644
--- a/tensorflow/contrib/data/python/ops/batching.py
+++ b/tensorflow/contrib/data/python/ops/batching.py
@@ -17,6 +17,7 @@ from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
+from tensorflow.contrib.data.python.ops import gen_dataset_ops
from tensorflow.python.data.ops import dataset_ops
from tensorflow.python.data.util import nest
from tensorflow.python.framework import dtypes
@@ -24,7 +25,6 @@ from tensorflow.python.framework import ops
from tensorflow.python.framework import tensor_shape
from tensorflow.python.framework import tensor_util
from tensorflow.python.ops import array_ops
-from tensorflow.python.ops import gen_dataset_ops
from tensorflow.python.ops import math_ops
@@ -103,42 +103,6 @@ def unbatch():
return _apply_fn
-def filter_irregular_batches(batch_size):
- """Transformation that filters out batches that are not of size batch_size."""
-
- def _apply_fn(dataset):
- """Function from `Dataset` to `Dataset` that applies the transformation."""
- tensor_batch_size = ops.convert_to_tensor(
- batch_size, dtype=dtypes.int64, name="batch_size")
-
- flattened = _RestructuredDataset(dataset,
- tuple(nest.flatten(dataset.output_types)))
-
- def _predicate(*xs):
- """Return `True` if this element is a full batch."""
- # Extract the dynamic batch size from the first component of the flattened
- # batched element.
- first_component = xs[0]
- first_component_batch_size = array_ops.shape(
- first_component, out_type=dtypes.int64)[0]
-
- return math_ops.equal(first_component_batch_size, tensor_batch_size)
-
- filtered = flattened.filter(_predicate)
-
- maybe_constant_batch_size = tensor_util.constant_value(tensor_batch_size)
-
- def _set_first_dimension(shape):
- return shape.merge_with(
- tensor_shape.vector(maybe_constant_batch_size).concatenate(shape[1:]))
-
- known_shapes = nest.map_structure(_set_first_dimension,
- dataset.output_shapes)
- return _RestructuredDataset(filtered, dataset.output_types, known_shapes)
-
- return _apply_fn
-
-
def batch_and_drop_remainder(batch_size):
"""A batching transformation that omits the final small batch (if present).
@@ -171,43 +135,34 @@ def batch_and_drop_remainder(batch_size):
def _apply_fn(dataset):
"""Function from `Dataset` to `Dataset` that applies the transformation."""
- batched = dataset.batch(batch_size)
- return filter_irregular_batches(batch_size)(batched)
-
- return _apply_fn
+ tensor_batch_size = ops.convert_to_tensor(
+ batch_size, dtype=dtypes.int64, name="batch_size")
+ batched = dataset.batch(tensor_batch_size)
+ flattened = _RestructuredDataset(batched,
+ tuple(nest.flatten(batched.output_types)))
-def padded_batch_and_drop_remainder(batch_size,
- padded_shapes,
- padding_values=None):
- """A batching and padding transformation that omits the final small batch.
+ def _predicate(*xs):
+ """Return `True` if this element is a full batch."""
+ # Extract the dynamic batch size from the first component of the flattened
+ # batched element.
+ first_component = xs[0]
+ first_component_batch_size = array_ops.shape(
+ first_component, out_type=dtypes.int64)[0]
- Like @{tf.data.Dataset.padded_batch}, this transformation combines
- consecutive elements of this dataset into batches. However, if the batch
- size does not evenly divide the input dataset size, this transformation will
- drop the final smaller element.
+ return math_ops.equal(first_component_batch_size, tensor_batch_size)
- See `@{tf.contrib.data.batch_and_drop_remainder}` for more details.
+ filtered = flattened.filter(_predicate)
- Args:
- batch_size: A `tf.int64` scalar `tf.Tensor`, representing the number of
- consecutive elements of this dataset to combine in a single batch.
- padded_shapes: A nested structure of `tf.TensorShape` or
- `tf.int64` vector tensor-like objects. See
- @{tf.data.Dataset.padded_batch} for details.
- padding_values: (Optional.) A nested structure of scalar-shaped
- `tf.Tensor`. See @{tf.data.Dataset.padded_batch} for details.
+ maybe_constant_batch_size = tensor_util.constant_value(tensor_batch_size)
- Returns:
- A `Dataset` transformation function, which can be passed to
- @{tf.data.Dataset.apply}
- """
+ def _set_first_dimension(shape):
+ return shape.merge_with(
+ tensor_shape.vector(maybe_constant_batch_size).concatenate(shape[1:]))
- def _apply_fn(dataset):
- """Function from `Dataset` to `Dataset` that applies the transformation."""
- batched = dataset.padded_batch(
- batch_size, padded_shapes=padded_shapes, padding_values=padding_values)
- return filter_irregular_batches(batch_size)(batched)
+ known_shapes = nest.map_structure(_set_first_dimension,
+ batched.output_shapes)
+ return _RestructuredDataset(filtered, batched.output_types, known_shapes)
return _apply_fn
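
The inlined predicate above keeps a batch only when the dynamic size of its first flattened component equals `batch_size`, which is what lets `_set_first_dimension` pin the batch dimension statically. A brief usage sketch, with illustrative sizes:

    # Sketch: 10 elements batched by 3 yield [0..2], [3..5], [6..8]; the final
    # partial batch [9] is dropped, so the batch dimension is statically 3.
    import tensorflow as tf
    from tensorflow.contrib.data.python.ops import batching

    dataset = tf.data.Dataset.range(10).apply(batching.batch_and_drop_remainder(3))
    print(dataset.output_shapes)  # (3,) rather than (?,)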
diff --git a/tensorflow/contrib/data/python/ops/dataset_ops.py b/tensorflow/contrib/data/python/ops/dataset_ops.py
index 45d6dbe743..c4c4426809 100644
--- a/tensorflow/contrib/data/python/ops/dataset_ops.py
+++ b/tensorflow/contrib/data/python/ops/dataset_ops.py
@@ -20,15 +20,21 @@ from __future__ import print_function
from tensorflow.contrib.data.python.ops import batching
from tensorflow.contrib.data.python.ops import enumerate_ops
from tensorflow.contrib.data.python.ops import error_ops
+from tensorflow.contrib.data.python.ops import gen_dataset_ops
from tensorflow.contrib.data.python.ops import grouping
+from tensorflow.contrib.util import loader
from tensorflow.python.data.ops import dataset_ops
from tensorflow.python.data.util import nest
-from tensorflow.python.ops import gen_dataset_ops
from tensorflow.python.ops import gen_io_ops
+from tensorflow.python.platform import resource_loader
from tensorflow.python.util import deprecation
+_dataset_ops = loader.load_op_library(
+ resource_loader.get_path_to_datafile("../../_dataset_ops.so"))
+
+
class Dataset(dataset_ops.Dataset):
"""Represents a potentially large set of elements.
diff --git a/tensorflow/contrib/data/python/ops/error_ops.py b/tensorflow/contrib/data/python/ops/error_ops.py
index 238bb52b02..51a2791072 100644
--- a/tensorflow/contrib/data/python/ops/error_ops.py
+++ b/tensorflow/contrib/data/python/ops/error_ops.py
@@ -17,9 +17,9 @@ from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
+from tensorflow.contrib.data.python.ops import gen_dataset_ops
from tensorflow.python.data.ops import dataset_ops
from tensorflow.python.data.util import nest
-from tensorflow.python.ops import gen_dataset_ops
def ignore_errors():
diff --git a/tensorflow/contrib/data/python/ops/grouping.py b/tensorflow/contrib/data/python/ops/grouping.py
index 6df7b22fb6..1c7c94b3c8 100644
--- a/tensorflow/contrib/data/python/ops/grouping.py
+++ b/tensorflow/contrib/data/python/ops/grouping.py
@@ -17,12 +17,12 @@ from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
+from tensorflow.contrib.data.python.ops import gen_dataset_ops
from tensorflow.python.data.ops import dataset_ops
from tensorflow.python.data.util import nest
from tensorflow.python.framework import dtypes
from tensorflow.python.framework import function
from tensorflow.python.framework import ops
-from tensorflow.python.ops import gen_dataset_ops
def group_by_window(key_func,
diff --git a/tensorflow/contrib/data/python/ops/interleave_ops.py b/tensorflow/contrib/data/python/ops/interleave_ops.py
index 74a919c1ff..ce23e95697 100644
--- a/tensorflow/contrib/data/python/ops/interleave_ops.py
+++ b/tensorflow/contrib/data/python/ops/interleave_ops.py
@@ -17,12 +17,12 @@ from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
+from tensorflow.contrib.data.python.ops import gen_dataset_ops
from tensorflow.python.data.ops import dataset_ops
from tensorflow.python.data.util import nest
from tensorflow.python.framework import dtypes
from tensorflow.python.framework import function
from tensorflow.python.framework import ops
-from tensorflow.python.ops import gen_dataset_ops
from tensorflow.python.util import deprecation
diff --git a/tensorflow/contrib/data/python/ops/iterator_ops.py b/tensorflow/contrib/data/python/ops/iterator_ops.py
index d736029fb0..32d2f42c93 100644
--- a/tensorflow/contrib/data/python/ops/iterator_ops.py
+++ b/tensorflow/contrib/data/python/ops/iterator_ops.py
@@ -17,8 +17,8 @@ from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
+from tensorflow.contrib.data.python.ops import gen_dataset_ops
from tensorflow.python.framework import ops
-from tensorflow.python.ops import gen_dataset_ops
from tensorflow.python.training import saver
diff --git a/tensorflow/contrib/data/python/ops/readers.py b/tensorflow/contrib/data/python/ops/readers.py
index 2e1c3153ca..f22298b757 100644
--- a/tensorflow/contrib/data/python/ops/readers.py
+++ b/tensorflow/contrib/data/python/ops/readers.py
@@ -18,6 +18,7 @@ from __future__ import division
from __future__ import print_function
from tensorflow.contrib.data.python.ops import dataset_ops as contrib_dataset_ops
+from tensorflow.contrib.data.python.ops import gen_dataset_ops
from tensorflow.python.data.ops import dataset_ops
from tensorflow.python.data.ops import readers
from tensorflow.python.data.util import nest
@@ -25,7 +26,6 @@ from tensorflow.python.framework import dtypes
from tensorflow.python.framework import ops
from tensorflow.python.framework import sparse_tensor as sparse_tensor_lib
from tensorflow.python.framework import tensor_shape
-from tensorflow.python.ops import gen_dataset_ops
from tensorflow.python.ops import parsing_ops
from tensorflow.python.platform import gfile
from tensorflow.python.util import deprecation
diff --git a/tensorflow/contrib/data/python/ops/scan_ops.py b/tensorflow/contrib/data/python/ops/scan_ops.py
index 5acaed48a3..87bbbb7d19 100644
--- a/tensorflow/contrib/data/python/ops/scan_ops.py
+++ b/tensorflow/contrib/data/python/ops/scan_ops.py
@@ -19,11 +19,11 @@ from __future__ import print_function
import collections
+from tensorflow.contrib.data.python.ops import gen_dataset_ops
from tensorflow.python.data.ops import dataset_ops
from tensorflow.python.data.util import nest
from tensorflow.python.framework import function
from tensorflow.python.framework import ops
-from tensorflow.python.ops import gen_dataset_ops
class _ScanDataset(dataset_ops.Dataset):
diff --git a/tensorflow/contrib/distributions/BUILD b/tensorflow/contrib/distributions/BUILD
index 2dc8ad9483..145b9495ff 100644
--- a/tensorflow/contrib/distributions/BUILD
+++ b/tensorflow/contrib/distributions/BUILD
@@ -141,6 +141,23 @@ cuda_py_test(
)
cuda_py_test(
+ name = "cauchy_test",
+ size = "medium",
+ srcs = ["python/kernel_tests/cauchy_test.py"],
+ additional_deps = [
+ ":distributions_py",
+ "//third_party/py/numpy",
+ "//tensorflow/python:array_ops",
+ "//tensorflow/python:client_testlib",
+ "//tensorflow/python:framework_for_generated_wrappers",
+ "//tensorflow/python:framework_test_lib",
+ "//tensorflow/python:gradients",
+ "//tensorflow/python:platform_test",
+ "//tensorflow/python:variables",
+ ],
+)
+
+cuda_py_test(
name = "chi2_test",
srcs = ["python/kernel_tests/chi2_test.py"],
additional_deps = [
diff --git a/tensorflow/contrib/distributions/__init__.py b/tensorflow/contrib/distributions/__init__.py
index 16f6533e57..0d12d83893 100644
--- a/tensorflow/contrib/distributions/__init__.py
+++ b/tensorflow/contrib/distributions/__init__.py
@@ -24,6 +24,7 @@ from __future__ import print_function
from tensorflow.contrib.distributions.python.ops import bijectors
from tensorflow.contrib.distributions.python.ops.binomial import *
+from tensorflow.contrib.distributions.python.ops.cauchy import *
from tensorflow.contrib.distributions.python.ops.chi2 import *
from tensorflow.contrib.distributions.python.ops.conditional_distribution import *
from tensorflow.contrib.distributions.python.ops.conditional_transformed_distribution import *
@@ -83,6 +84,7 @@ from tensorflow.python.util.all_util import remove_undocumented
_allowed_symbols = [
'bijectors',
+ 'Cauchy',
'ConditionalDistribution',
'ConditionalTransformedDistribution',
'FULLY_REPARAMETERIZED',
diff --git a/tensorflow/contrib/distributions/python/kernel_tests/cauchy_test.py b/tensorflow/contrib/distributions/python/kernel_tests/cauchy_test.py
new file mode 100644
index 0000000000..7f7697357c
--- /dev/null
+++ b/tensorflow/contrib/distributions/python/kernel_tests/cauchy_test.py
@@ -0,0 +1,437 @@
+# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Tests for Cauchy."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import importlib
+import numpy as np
+
+from tensorflow.contrib.distributions.python.ops import cauchy as cauchy_lib
+from tensorflow.python.framework import constant_op
+from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import ops
+from tensorflow.python.framework import tensor_shape
+from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import gradients_impl
+from tensorflow.python.ops import variables
+from tensorflow.python.platform import test
+from tensorflow.python.platform import tf_logging
+
+
+def try_import(name): # pylint: disable=invalid-name
+ module = None
+ try:
+ module = importlib.import_module(name)
+ except ImportError as e:
+ tf_logging.warning("Could not import %s: %s" % (name, str(e)))
+ return module
+
+stats = try_import("scipy.stats")
+
+
+class CauchyTest(test.TestCase):
+
+ def setUp(self):
+ self._rng = np.random.RandomState(123)
+
+ def assertAllFinite(self, tensor):
+ is_finite = np.isfinite(tensor.eval())
+ all_true = np.ones_like(is_finite, dtype=np.bool)
+ self.assertAllEqual(all_true, is_finite)
+
+ def _testParamShapes(self, sample_shape, expected):
+ with self.test_session():
+ param_shapes = cauchy_lib.Cauchy.param_shapes(sample_shape)
+ loc_shape, scale_shape = param_shapes["loc"], param_shapes["scale"]
+ self.assertAllEqual(expected, loc_shape.eval())
+ self.assertAllEqual(expected, scale_shape.eval())
+ loc = array_ops.zeros(loc_shape)
+ scale = array_ops.ones(scale_shape)
+ self.assertAllEqual(
+ expected,
+ array_ops.shape(cauchy_lib.Cauchy(loc, scale).sample()).eval())
+
+ def _testParamStaticShapes(self, sample_shape, expected):
+ param_shapes = cauchy_lib.Cauchy.param_static_shapes(sample_shape)
+ loc_shape, scale_shape = param_shapes["loc"], param_shapes["scale"]
+ self.assertEqual(expected, loc_shape)
+ self.assertEqual(expected, scale_shape)
+
+ def testParamShapes(self):
+ sample_shape = [10, 3, 4]
+ self._testParamShapes(sample_shape, sample_shape)
+ self._testParamShapes(constant_op.constant(sample_shape), sample_shape)
+
+ def testParamStaticShapes(self):
+ sample_shape = [10, 3, 4]
+ self._testParamStaticShapes(sample_shape, sample_shape)
+ self._testParamStaticShapes(
+ tensor_shape.TensorShape(sample_shape), sample_shape)
+
+ def testCauchyLogPDF(self):
+ with self.test_session():
+ batch_size = 6
+ loc = constant_op.constant([3.0] * batch_size)
+ scale = constant_op.constant([np.sqrt(10.0)] * batch_size)
+ x = np.array([-2.5, 2.5, 4.0, 0.0, -1.0, 2.0], dtype=np.float32)
+ cauchy = cauchy_lib.Cauchy(loc=loc, scale=scale)
+
+ log_pdf = cauchy.log_prob(x)
+ self.assertAllEqual(cauchy.batch_shape_tensor().eval(),
+ log_pdf.shape)
+ self.assertAllEqual(cauchy.batch_shape_tensor().eval(),
+ log_pdf.eval().shape)
+ self.assertAllEqual(cauchy.batch_shape, log_pdf.shape)
+ self.assertAllEqual(cauchy.batch_shape, log_pdf.eval().shape)
+
+ pdf = cauchy.prob(x)
+ self.assertAllEqual(cauchy.batch_shape_tensor().eval(), pdf.shape)
+ self.assertAllEqual(cauchy.batch_shape_tensor().eval(), pdf.eval().shape)
+ self.assertAllEqual(cauchy.batch_shape, pdf.shape)
+ self.assertAllEqual(cauchy.batch_shape, pdf.eval().shape)
+
+ if not stats:
+ return
+ expected_log_pdf = stats.cauchy(loc.eval(), scale.eval()).logpdf(x)
+ self.assertAllClose(expected_log_pdf, log_pdf.eval())
+ self.assertAllClose(np.exp(expected_log_pdf), pdf.eval())
+
+ def testCauchyLogPDFMultidimensional(self):
+ with self.test_session():
+ batch_size = 6
+ loc = constant_op.constant([[3.0, -3.0]] * batch_size)
+ scale = constant_op.constant([[np.sqrt(10.0), np.sqrt(15.0)]] *
+ batch_size)
+ x = np.array([[-2.5, 2.5, 4.0, 0.0, -1.0, 2.0]], dtype=np.float32).T
+ cauchy = cauchy_lib.Cauchy(loc=loc, scale=scale)
+
+ log_pdf = cauchy.log_prob(x)
+ log_pdf_values = log_pdf.eval()
+ self.assertEqual(log_pdf.shape, (6, 2))
+ self.assertAllEqual(cauchy.batch_shape_tensor().eval(),
+ log_pdf.shape)
+ self.assertAllEqual(cauchy.batch_shape_tensor().eval(),
+ log_pdf.eval().shape)
+ self.assertAllEqual(cauchy.batch_shape, log_pdf.shape)
+ self.assertAllEqual(cauchy.batch_shape, log_pdf.eval().shape)
+
+ pdf = cauchy.prob(x)
+ pdf_values = pdf.eval()
+ self.assertEqual(pdf.shape, (6, 2))
+ self.assertAllEqual(cauchy.batch_shape_tensor().eval(), pdf.shape)
+ self.assertAllEqual(cauchy.batch_shape_tensor().eval(), pdf_values.shape)
+ self.assertAllEqual(cauchy.batch_shape, pdf.shape)
+ self.assertAllEqual(cauchy.batch_shape, pdf_values.shape)
+
+ if not stats:
+ return
+ expected_log_pdf = stats.cauchy(loc.eval(), scale.eval()).logpdf(x)
+ self.assertAllClose(expected_log_pdf, log_pdf_values)
+ self.assertAllClose(np.exp(expected_log_pdf), pdf_values)
+
+ def testCauchyCDF(self):
+ with self.test_session():
+ batch_size = 50
+ loc = self._rng.randn(batch_size)
+ scale = self._rng.rand(batch_size) + 1.0
+ x = np.linspace(-8.0, 8.0, batch_size).astype(np.float64)
+
+ cauchy = cauchy_lib.Cauchy(loc=loc, scale=scale)
+ cdf = cauchy.cdf(x)
+ self.assertAllEqual(cauchy.batch_shape_tensor().eval(), cdf.shape)
+ self.assertAllEqual(cauchy.batch_shape_tensor().eval(), cdf.eval().shape)
+ self.assertAllEqual(cauchy.batch_shape, cdf.shape)
+ self.assertAllEqual(cauchy.batch_shape, cdf.eval().shape)
+ if not stats:
+ return
+ expected_cdf = stats.cauchy(loc, scale).cdf(x)
+ self.assertAllClose(expected_cdf, cdf.eval(), atol=0)
+
+ def testCauchySurvivalFunction(self):
+ with self.test_session():
+ batch_size = 50
+ loc = self._rng.randn(batch_size)
+ scale = self._rng.rand(batch_size) + 1.0
+ x = np.linspace(-8.0, 8.0, batch_size).astype(np.float64)
+
+ cauchy = cauchy_lib.Cauchy(loc=loc, scale=scale)
+
+ sf = cauchy.survival_function(x)
+ self.assertAllEqual(cauchy.batch_shape_tensor().eval(), sf.shape)
+ self.assertAllEqual(cauchy.batch_shape_tensor().eval(), sf.eval().shape)
+ self.assertAllEqual(cauchy.batch_shape, sf.shape)
+ self.assertAllEqual(cauchy.batch_shape, sf.eval().shape)
+ if not stats:
+ return
+ expected_sf = stats.cauchy(loc, scale).sf(x)
+ self.assertAllClose(expected_sf, sf.eval(), atol=0)
+
+ def testCauchyLogCDF(self):
+ with self.test_session():
+ batch_size = 50
+ loc = self._rng.randn(batch_size)
+ scale = self._rng.rand(batch_size) + 1.0
+ x = np.linspace(-100.0, 10.0, batch_size).astype(np.float64)
+
+ cauchy = cauchy_lib.Cauchy(loc=loc, scale=scale)
+
+ cdf = cauchy.log_cdf(x)
+ self.assertAllEqual(cauchy.batch_shape_tensor().eval(), cdf.shape)
+ self.assertAllEqual(cauchy.batch_shape_tensor().eval(), cdf.eval().shape)
+ self.assertAllEqual(cauchy.batch_shape, cdf.shape)
+ self.assertAllEqual(cauchy.batch_shape, cdf.eval().shape)
+
+ if not stats:
+ return
+ expected_cdf = stats.cauchy(loc, scale).logcdf(x)
+ self.assertAllClose(expected_cdf, cdf.eval(), atol=0, rtol=1e-5)
+
+ def testFiniteGradientAtDifficultPoints(self):
+ for dtype in [np.float32, np.float64]:
+ g = ops.Graph()
+ with g.as_default():
+ loc = variables.Variable(dtype(0.0))
+ scale = variables.Variable(dtype(1.0))
+ dist = cauchy_lib.Cauchy(loc=loc, scale=scale)
+ x = np.array([-100., -20., -5., 0., 5., 20., 100.]).astype(dtype)
+ for func in [
+ dist.cdf, dist.log_cdf, dist.survival_function,
+ dist.log_survival_function, dist.log_prob, dist.prob
+ ]:
+ value = func(x)
+ grads = gradients_impl.gradients(value, [loc, scale])
+ with self.test_session(graph=g):
+ variables.global_variables_initializer().run()
+ self.assertAllFinite(value)
+ self.assertAllFinite(grads[0])
+ self.assertAllFinite(grads[1])
+
+ def testCauchyLogSurvivalFunction(self):
+ with self.test_session():
+ batch_size = 50
+ loc = self._rng.randn(batch_size)
+ scale = self._rng.rand(batch_size) + 1.0
+ x = np.linspace(-10.0, 100.0, batch_size).astype(np.float64)
+
+ cauchy = cauchy_lib.Cauchy(loc=loc, scale=scale)
+
+ sf = cauchy.log_survival_function(x)
+ self.assertAllEqual(cauchy.batch_shape_tensor().eval(), sf.shape)
+ self.assertAllEqual(cauchy.batch_shape_tensor().eval(), sf.eval().shape)
+ self.assertAllEqual(cauchy.batch_shape, sf.shape)
+ self.assertAllEqual(cauchy.batch_shape, sf.eval().shape)
+
+ if not stats:
+ return
+ expected_sf = stats.cauchy(loc, scale).logsf(x)
+ self.assertAllClose(expected_sf, sf.eval(), atol=0, rtol=1e-5)
+
+ def testCauchyEntropy(self):
+ with self.test_session():
+ loc = np.array([1.0, 1.0, 1.0])
+ scale = np.array([[1.0, 2.0, 3.0]])
+ cauchy = cauchy_lib.Cauchy(loc=loc, scale=scale)
+
+ entropy = cauchy.entropy()
+ self.assertAllEqual(cauchy.batch_shape_tensor().eval(),
+ entropy.shape)
+ self.assertAllEqual(cauchy.batch_shape_tensor().eval(),
+ entropy.eval().shape)
+ self.assertAllEqual(cauchy.batch_shape, entropy.shape)
+ self.assertAllEqual(cauchy.batch_shape, entropy.eval().shape)
+
+ if not stats:
+ return
+ expected_entropy = stats.cauchy(loc, scale).entropy()
+ self.assertAllClose(expected_entropy, entropy.eval())
+
+ def testCauchyMode(self):
+ with self.test_session():
+ # loc will be broadcast to [7, 7, 7].
+ loc = [7.]
+ scale = [11., 12., 13.]
+
+ cauchy = cauchy_lib.Cauchy(loc=loc, scale=scale)
+
+ self.assertAllEqual((3,), cauchy.mode().shape)
+ self.assertAllEqual([7., 7, 7], cauchy.mode().eval())
+
+ def testCauchyMean(self):
+ with self.test_session():
+ loc = [1., 2., 3.]
+ scale = [7.]
+ cauchy = cauchy_lib.Cauchy(loc=loc, scale=scale)
+
+ self.assertAllEqual((3,), cauchy.mean().shape)
+ self.assertAllEqual([np.nan] * 3, cauchy.mean().eval())
+
+ def testCauchyNanMean(self):
+ with self.test_session():
+ loc = [1., 2., 3.]
+ scale = [7.]
+ cauchy = cauchy_lib.Cauchy(loc=loc, scale=scale, allow_nan_stats=False)
+
+ with self.assertRaises(ValueError):
+ cauchy.mean().eval()
+
+ def testCauchyQuantile(self):
+ with self.test_session():
+ batch_size = 50
+ loc = self._rng.randn(batch_size)
+ scale = self._rng.rand(batch_size) + 1.0
+ p = np.linspace(0.000001, 0.999999, batch_size).astype(np.float64)
+
+ cauchy = cauchy_lib.Cauchy(loc=loc, scale=scale)
+ x = cauchy.quantile(p)
+
+ self.assertAllEqual(cauchy.batch_shape_tensor().eval(), x.shape)
+ self.assertAllEqual(cauchy.batch_shape_tensor().eval(), x.eval().shape)
+ self.assertAllEqual(cauchy.batch_shape, x.shape)
+ self.assertAllEqual(cauchy.batch_shape, x.eval().shape)
+
+ if not stats:
+ return
+ expected_x = stats.cauchy(loc, scale).ppf(p)
+ self.assertAllClose(expected_x, x.eval(), atol=0.)
+
+ def testCauchyVariance(self):
+ with self.test_session():
+ # scale will be broadcast to [7, 7, 7]
+ loc = [1., 2., 3.]
+ scale = [7.]
+ cauchy = cauchy_lib.Cauchy(loc=loc, scale=scale)
+
+ self.assertAllEqual((3,), cauchy.variance().shape)
+ self.assertAllEqual([np.nan] * 3, cauchy.variance().eval())
+
+ def testCauchyNanVariance(self):
+ with self.test_session():
+ # scale will be broadcast to [7, 7, 7]
+ loc = [1., 2., 3.]
+ scale = [7.]
+ cauchy = cauchy_lib.Cauchy(loc=loc, scale=scale, allow_nan_stats=False)
+
+ with self.assertRaises(ValueError):
+ cauchy.variance().eval()
+
+ def testCauchyStandardDeviation(self):
+ with self.test_session():
+ # scale will be broadcast to [7, 7, 7]
+ loc = [1., 2., 3.]
+ scale = [7.]
+ cauchy = cauchy_lib.Cauchy(loc=loc, scale=scale)
+
+ self.assertAllEqual((3,), cauchy.stddev().shape)
+ self.assertAllEqual([np.nan] * 3, cauchy.stddev().eval())
+
+ def testCauchyNanStandardDeviation(self):
+ with self.test_session():
+ # scale will be broadcast to [7, 7, 7]
+ loc = [1., 2., 3.]
+ scale = [7.]
+ cauchy = cauchy_lib.Cauchy(loc=loc, scale=scale, allow_nan_stats=False)
+
+ with self.assertRaises(ValueError):
+ cauchy.stddev().eval()
+
+ def testCauchySample(self):
+ with self.test_session():
+ loc = constant_op.constant(3.0)
+ scale = constant_op.constant(1.0)
+ loc_v = 3.0
+ n = constant_op.constant(100000)
+ cauchy = cauchy_lib.Cauchy(loc=loc, scale=scale)
+ samples = cauchy.sample(n)
+ sample_values = samples.eval()
+
+ self.assertEqual(sample_values.shape, (100000,))
+ self.assertAllClose(np.median(sample_values), loc_v, atol=1e-1)
+
+ expected_shape = tensor_shape.TensorShape([n.eval()]).concatenate(
+ tensor_shape.TensorShape(cauchy.batch_shape_tensor().eval()))
+
+ self.assertAllEqual(expected_shape, samples.shape)
+ self.assertAllEqual(expected_shape, sample_values.shape)
+
+ expected_shape = (tensor_shape.TensorShape(
+ [n.eval()]).concatenate(cauchy.batch_shape))
+
+ self.assertAllEqual(expected_shape, samples.shape)
+ self.assertAllEqual(expected_shape, sample_values.shape)
+
+ def testCauchySampleMultiDimensional(self):
+ with self.test_session():
+ batch_size = 2
+ loc = constant_op.constant([[3.0, -3.0]] * batch_size)
+ scale = constant_op.constant([[0.5, 1.0]] * batch_size)
+ loc_v = [3.0, -3.0]
+ n = constant_op.constant(100000)
+ cauchy = cauchy_lib.Cauchy(loc=loc, scale=scale)
+ samples = cauchy.sample(n)
+ sample_values = samples.eval()
+ self.assertEqual(samples.shape, (100000, batch_size, 2))
+ self.assertAllClose(np.median(sample_values[:, 0, 0]),
+ loc_v[0], atol=1e-1)
+ self.assertAllClose(np.median(sample_values[:, 0, 1]),
+ loc_v[1], atol=1e-1)
+
+ expected_shape = tensor_shape.TensorShape([n.eval()]).concatenate(
+ tensor_shape.TensorShape(cauchy.batch_shape_tensor().eval()))
+ self.assertAllEqual(expected_shape, samples.shape)
+ self.assertAllEqual(expected_shape, sample_values.shape)
+
+ expected_shape = (tensor_shape.TensorShape(
+ [n.eval()]).concatenate(cauchy.batch_shape))
+ self.assertAllEqual(expected_shape, samples.shape)
+ self.assertAllEqual(expected_shape, sample_values.shape)
+
+ def testCauchyNegativeScaleFails(self):
+ with self.test_session():
+ cauchy = cauchy_lib.Cauchy(loc=[1.], scale=[-5.], validate_args=True)
+ with self.assertRaisesOpError("Condition x > 0 did not hold"):
+ cauchy.mode().eval()
+
+ def testCauchyShape(self):
+ with self.test_session():
+ loc = constant_op.constant([-3.0] * 5)
+ scale = constant_op.constant(11.0)
+ cauchy = cauchy_lib.Cauchy(loc=loc, scale=scale)
+
+ self.assertEqual(cauchy.batch_shape_tensor().eval(), [5])
+ self.assertEqual(cauchy.batch_shape, tensor_shape.TensorShape([5]))
+ self.assertAllEqual(cauchy.event_shape_tensor().eval(), [])
+ self.assertEqual(cauchy.event_shape, tensor_shape.TensorShape([]))
+
+ def testCauchyShapeWithPlaceholders(self):
+ loc = array_ops.placeholder(dtype=dtypes.float32)
+ scale = array_ops.placeholder(dtype=dtypes.float32)
+ cauchy = cauchy_lib.Cauchy(loc=loc, scale=scale)
+
+ with self.test_session() as sess:
+ # batch_shape should be an "<unknown>" TensorShape.
+ self.assertEqual(cauchy.batch_shape, tensor_shape.TensorShape(None))
+ self.assertEqual(cauchy.event_shape, ())
+ self.assertAllEqual(cauchy.event_shape_tensor().eval(), [])
+ self.assertAllEqual(
+ sess.run(cauchy.batch_shape_tensor(),
+ feed_dict={loc: 5.0,
+ scale: [1.0, 2.0]}), [2])
+
+
+if __name__ == "__main__":
+ test.main()
diff --git a/tensorflow/contrib/distributions/python/ops/cauchy.py b/tensorflow/contrib/distributions/python/ops/cauchy.py
new file mode 100644
index 0000000000..a17bb091f6
--- /dev/null
+++ b/tensorflow/contrib/distributions/python/ops/cauchy.py
@@ -0,0 +1,223 @@
+# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""The Cauchy distribution class."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import numpy as np
+
+from tensorflow.python.framework import constant_op
+from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import ops
+from tensorflow.python.framework import tensor_shape
+from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import check_ops
+from tensorflow.python.ops import math_ops
+from tensorflow.python.ops import random_ops
+from tensorflow.python.ops.distributions import distribution
+
+
+__all__ = [
+ "Cauchy",
+]
+
+
+class Cauchy(distribution.Distribution):
+ """The Cauchy distribution with location `loc` and scale `scale`.
+
+ #### Mathematical details
+
+ The probability density function (pdf) is,
+
+ ```none
+ pdf(x; loc, scale) = 1 / (pi * scale * (1 + ((x - loc) / scale)**2))
+ ```
+ where `loc` is the location, and `scale` is the scale.
+
+ The Cauchy distribution is a member of the [location-scale family](
+ https://en.wikipedia.org/wiki/Location-scale_family), i.e.
+
+ ```none
+ X ~ Cauchy(loc=0, scale=1)
+ Y ~ Cauchy(loc=loc, scale=scale)
+ Y = loc + scale * X
+ ```
+
+ #### Examples
+
+ Examples of initialization of one or a batch of distributions.
+
+ ```python
+ # Define a single scalar Cauchy distribution.
+ dist = Cauchy(loc=0., scale=3.)
+
+ # Evaluate the cdf at 1, returning a scalar.
+ dist.cdf(1.)
+
+ # Define a batch of two scalar valued Cauchy distributions.
+ dist = Cauchy(loc=[1, 2.], scale=[11, 22.])
+
+ # Evaluate the pdf of the first distribution on 0, and the second on 1.5,
+ # returning a length two tensor.
+ dist.prob([0, 1.5])
+
+ # Get 3 samples, returning a 3 x 2 tensor.
+ dist.sample([3])
+ ```
+
+ Arguments are broadcast when possible.
+
+ ```python
+ # Define a batch of two scalar valued Cauchy distributions.
+ # Both have median 1, but different scales.
+ dist = tf.contrib.distributions.Cauchy(loc=1., scale=[11, 22.])
+ # Evaluate the pdf of both distributions on the same point, 3.0,
+ # returning a length 2 tensor.
+ dist.prob(3.0)
+ ```
+ """
+
+ def __init__(self,
+ loc,
+ scale,
+ validate_args=False,
+ allow_nan_stats=True,
+ name="Cauchy"):
+ """Construct Cauchy distributions with loc and and scale `loc` and `scale`.
+
+ The parameters `loc` and `scale` must be shaped in a way that supports
+ broadcasting (e.g. `loc + scale` is a valid operation).
+
+ Args:
+ loc: Floating point tensor; the modes of the distribution(s).
+ scale: Floating point tensor; the scales of the distribution(s).
+ Must contain only positive values.
+ validate_args: Python `bool`, default `False`. When `True` distribution
+ parameters are checked for validity despite possibly degrading runtime
+ performance. When `False` invalid inputs may silently render incorrect
+ outputs.
+ allow_nan_stats: Python `bool`, default `True`. When `True`,
+ statistics (e.g., mean, mode, variance) use the value "`NaN`" to
+ indicate the result is undefined. When `False`, an exception is raised
+ if one or more of the statistic's batch members are undefined.
+ name: Python `str` name prefixed to Ops created by this class.
+
+ Raises:
+ TypeError: if `loc` and `scale` have different `dtype`.
+ """
+ parameters = locals()
+ with ops.name_scope(name, values=[loc, scale]):
+ with ops.control_dependencies([check_ops.assert_positive(scale)] if
+ validate_args else []):
+ self._loc = array_ops.identity(loc, name="loc")
+ self._scale = array_ops.identity(scale, name="scale")
+ check_ops.assert_same_float_dtype([self._loc, self._scale])
+ super(Cauchy, self).__init__(
+ dtype=self._scale.dtype,
+ reparameterization_type=distribution.FULLY_REPARAMETERIZED,
+ validate_args=validate_args,
+ allow_nan_stats=allow_nan_stats,
+ parameters=parameters,
+ graph_parents=[self._loc, self._scale],
+ name=name)
+
+ @staticmethod
+ def _param_shapes(sample_shape):
+ return dict(
+ zip(("loc", "scale"), ([ops.convert_to_tensor(
+ sample_shape, dtype=dtypes.int32)] * 2)))
+
+ @property
+ def loc(self):
+ """Distribution parameter for the mean."""
+ return self._loc
+
+ @property
+ def scale(self):
+ """Distribution parameter for standard deviation."""
+ return self._scale
+
+ def _batch_shape_tensor(self):
+ return array_ops.broadcast_dynamic_shape(
+ array_ops.shape(self.loc),
+ array_ops.shape(self.scale))
+
+ def _batch_shape(self):
+ return array_ops.broadcast_static_shape(
+ self.loc.shape,
+ self.scale.shape)
+
+ def _event_shape_tensor(self):
+ return constant_op.constant([], dtype=dtypes.int32)
+
+ def _event_shape(self):
+ return tensor_shape.scalar()
+
+ def _sample_n(self, n, seed=None):
+ shape = array_ops.concat([[n], self.batch_shape_tensor()], 0)
+ probs = random_ops.random_uniform(
+ shape=shape, minval=0., maxval=1., dtype=self.dtype, seed=seed)
+ return self._quantile(probs)
+
+ def _log_prob(self, x):
+ return self._log_unnormalized_prob(x) - self._log_normalization()
+
+ def _cdf(self, x):
+ return math_ops.atan(self._z(x)) / np.pi + 0.5
+
+ def _log_cdf(self, x):
+ return math_ops.log1p(2 / np.pi * math_ops.atan(self._z(x))) - np.log(2)
+
+ def _log_unnormalized_prob(self, x):
+ return -math_ops.log1p(math_ops.square(self._z(x)))
+
+ def _log_normalization(self):
+ return np.log(np.pi) + math_ops.log(self.scale)
+
+ def _entropy(self):
+ h = np.log(4 * np.pi) + math_ops.log(self.scale)
+ return h * array_ops.ones_like(self.loc)
+
+ def _quantile(self, p):
+ return self.loc + self.scale * math_ops.tan(np.pi * (p - 0.5))
+
+ def _mode(self):
+ return self.loc * array_ops.ones_like(self.scale)
+
+ def _z(self, x):
+ """Standardize input `x`."""
+ with ops.name_scope("standardize", values=[x]):
+ return (x - self.loc) / self.scale
+
+ def _inv_z(self, z):
+ """Reconstruct input `x` from a its normalized version."""
+ with ops.name_scope("reconstruct", values=[z]):
+ return z * self.scale + self.loc
+
+ def _mean(self):
+ if self.allow_nan_stats:
+ return array_ops.fill(self.batch_shape_tensor(),
+ self.dtype.as_numpy_dtype(np.nan))
+ else:
+ raise ValueError("`mean` is undefined for Cauchy distribution.")
+
+ def _stddev(self):
+ if self.allow_nan_stats:
+ return array_ops.fill(self.batch_shape_tensor(),
+ self.dtype.as_numpy_dtype(np.nan))
+ else:
+ raise ValueError("`stddev` is undefined for Cauchy distribution.")
diff --git a/tensorflow/contrib/eager/README.md b/tensorflow/contrib/eager/README.md
index dcc370cd00..ae4b07799f 100644
--- a/tensorflow/contrib/eager/README.md
+++ b/tensorflow/contrib/eager/README.md
@@ -1,4 +1,4 @@
-# Eager Execution
+# TensorFlow Eager Execution
> *WARNING*: This is a preview/pre-alpha version. The API and performance
> characteristics are subject to change.
diff --git a/tensorflow/contrib/eager/python/examples/notebooks/1_basics.ipynb b/tensorflow/contrib/eager/python/examples/notebooks/1_basics.ipynb
index 01616f2e7d..459f2f4a7d 100644
--- a/tensorflow/contrib/eager/python/examples/notebooks/1_basics.ipynb
+++ b/tensorflow/contrib/eager/python/examples/notebooks/1_basics.ipynb
@@ -429,7 +429,9 @@
"cpu_tensor = tf.random_normal([SIZE, SIZE])\n",
"\n",
"if is_gpu_available:\n",
- " gpu_tensor = cpu_tensor.gpu()"
+ " gpu_tensor = cpu_tensor.gpu()\n",
+ "else:\n",
+ " print(\"GPU not available.\")"
]
},
{
diff --git a/tensorflow/contrib/eager/python/examples/notebooks/2_gradients.ipynb b/tensorflow/contrib/eager/python/examples/notebooks/2_gradients.ipynb
index 3b7e2cd435..e6c7c11733 100644
--- a/tensorflow/contrib/eager/python/examples/notebooks/2_gradients.ipynb
+++ b/tensorflow/contrib/eager/python/examples/notebooks/2_gradients.ipynb
@@ -383,7 +383,7 @@
"\n",
"`implicit_value_and_gradients()` returns a function that accepts the same inputs as the function passed in, and returns a tuple consisting of:\n",
"\n",
- "1. the value returned by the function passed in (in this case, the loss calculated by `calculate_linear_model_loss()`), and\n",
+ "1. the value returned by the function passed in (in this case, the loss calculated by `loss_fn()`), and\n",
"1. a list of tuples consisting of:\n",
" 1. The value of the gradient (a `tf.Tensor`) with respect to a given variable\n",
" 1. The corresponding variable (`tf.Variable`)\n",
@@ -698,7 +698,7 @@
"source": [
"## Other Ways to Compute Gradients\n",
"\n",
- "Using our loss function as an example (`calculate_linear_model_loss()`), there are several other ways we could compute gradients:\n",
+ "Using our loss function as an example (`loss_fn()`), there are several other ways we could compute gradients:\n",
"\n",
"1. `tfe.implicit_gradients()`\n",
"1. `tfe.gradients_function()`\n",
@@ -841,7 +841,7 @@
"# tfe.implicit_value_and_gradients() demo\n",
"value_gradients_fn = tfe.implicit_value_and_gradients(loss_fn)\n",
"\n",
- "# Returns only gradients:\n",
+ "# Returns the value returned by the function passed in, gradients, and variables:\n",
"value_gradients_fn(inputs, labels, wb)"
]
}
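
A compact sketch of that return structure, assuming eager execution is enabled and using a trivial stand-in for `loss_fn` (all names here are illustrative):

    # Sketch: implicit_value_and_gradients returns (value, [(gradient, variable), ...]).
    import tensorflow as tf
    import tensorflow.contrib.eager as tfe

    tfe.enable_eager_execution()
    w = tfe.Variable(3.0)

    def loss_fn(x):
      return w * x  # depends on w implicitly

    value_and_grads_fn = tfe.implicit_value_and_gradients(loss_fn)
    value, grads_and_vars = value_and_grads_fn(2.0)
    # value == 6.0; grads_and_vars == [(2.0, w)] since d(w * x)/dw = x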
diff --git a/tensorflow/contrib/eager/python/examples/notebooks/3_datasets.ipynb b/tensorflow/contrib/eager/python/examples/notebooks/3_datasets.ipynb
index ebcc7027c1..0088da5c4b 100644
--- a/tensorflow/contrib/eager/python/examples/notebooks/3_datasets.ipynb
+++ b/tensorflow/contrib/eager/python/examples/notebooks/3_datasets.ipynb
@@ -9,7 +9,7 @@
"source": [
"# Eager Execution Tutorial: Importing Data\n",
"\n",
- "This notebook demonstrates the use of the [`tf.contrib.data.Dataset` API](https://www.tensorflow.org/programmers_guide/datasets) to build pipelines to feed data to your program. It covers:\n",
+ "This notebook demonstrates the use of the [`tf.data.Dataset` API](https://www.tensorflow.org/programmers_guide/datasets) to build pipelines to feed data to your program. It covers:\n",
"\n",
"* Creating a `Dataset`.\n",
"* Iteration over a `Dataset` with eager execution enabled.\n",
@@ -64,7 +64,7 @@
"source": [
"# Step 1: Create a source `Dataset`\n",
"\n",
- "Create a _source_ dataset using one of the factory functions like [`Dataset.from_tensors`](https://www.tensorflow.org/api_docs/python/tf/contrib/data/Dataset#from_tensors), [`Dataset.from_tensor_slices`](https://www.tensorflow.org/api_docs/python/tf/contrib/data/Dataset#from_tensor_slices) or using objects that read from files like [`TextLineDataset`](https://www.tensorflow.org/api_docs/python/tf/contrib/data/TextLineDataset) or [`TFRecordDataset`](https://www.tensorflow.org/api_docs/python/tf/contrib/data/TFRecordDataset). See the [Programmer's Guide](https://www.google.com/url?sa=D\u0026q=https%3A%2F%2Fwww.tensorflow.org%2Fprogrammers_guide%2Fdatasets%23reading_input_data) for more information."
+ "Create a _source_ dataset using one of the factory functions like [`Dataset.from_tensors`](https://www.tensorflow.org/api_docs/python/tf/data/Dataset#from_tensors), [`Dataset.from_tensor_slices`](https://www.tensorflow.org/api_docs/python/tf/data/Dataset#from_tensor_slices) or using objects that read from files like [`TextLineDataset`](https://www.tensorflow.org/api_docs/python/tf/data/TextLineDataset) or [`TFRecordDataset`](https://www.tensorflow.org/api_docs/python/tf/data/TFRecordDataset). See the [Programmer's Guide](https://www.google.com/url?sa=D\u0026q=https%3A%2F%2Fwww.tensorflow.org%2Fprogrammers_guide%2Fdatasets%23reading_input_data) for more information."
]
},
{
@@ -83,7 +83,7 @@
},
"outputs": [],
"source": [
- "ds_tensors = tf.contrib.data.Dataset.from_tensor_slices([1, 2, 3, 4, 5, 6])\n",
+ "ds_tensors = tf.data.Dataset.from_tensor_slices([1, 2, 3, 4, 5, 6])\n",
"\n",
"# Create a CSV file\n",
"import tempfile\n",
@@ -93,7 +93,7 @@
"Line 2\n",
"Line 3\n",
" \"\"\")\n",
- "ds_file = tf.contrib.data.TextLineDataset(filename)\n"
+ "ds_file = tf.data.TextLineDataset(filename)\n"
]
},
{
@@ -105,7 +105,7 @@
"source": [
"# Step 2: Apply transformations\n",
"\n",
- "Use the transformations functions like [`map`](https://www.tensorflow.org/api_docs/python/tf/contrib/data/Dataset#map), [`batch`](https://www.tensorflow.org/api_docs/python/tf/contrib/data/Dataset#batch), [`shuffle`](https://www.tensorflow.org/api_docs/python/tf/contrib/data/Dataset#shuffle) etc. to apply transformations to the records of the dataset. See the [API documentation for `tf.contrib.data.Dataset`](https://www.tensorflow.org/api_docs/python/tf/contrib/data/Dataset) for details."
+ "Use the transformations functions like [`map`](https://www.tensorflow.org/api_docs/python/tf/data/Dataset#map), [`batch`](https://www.tensorflow.org/api_docs/python/tf/data/Dataset#batch), [`shuffle`](https://www.tensorflow.org/api_docs/python/tf/data/Dataset#shuffle) etc. to apply transformations to the records of the dataset. See the [API documentation for `tf.data.Dataset`](https://www.tensorflow.org/api_docs/python/tf/data/Dataset) for details."
]
},
{
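
With eager execution enabled, iterating a `Dataset` on this branch goes through `tfe.Iterator`; a short sketch (element values are illustrative):

    # Sketch: eager iteration over a transformed dataset.
    import tensorflow as tf
    import tensorflow.contrib.eager as tfe

    tfe.enable_eager_execution()
    ds = tf.data.Dataset.from_tensor_slices([1, 2, 3, 4, 5, 6]).map(tf.square).batch(2)
    for batch in tfe.Iterator(ds):
      print(batch)  # [1 4], then [9 16], then [25 36]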
diff --git a/tensorflow/contrib/eager/python/network.py b/tensorflow/contrib/eager/python/network.py
index 1a5c6e8aec..c6e628b074 100644
--- a/tensorflow/contrib/eager/python/network.py
+++ b/tensorflow/contrib/eager/python/network.py
@@ -244,12 +244,6 @@ class Network(base.Layer):
self._owned_layers = {}
# The scope to use if we end up without a parent.
self._default_parent_variable_scope = variable_scope.get_variable_scope()
- # Hold on to the variable scope counts from init to check whether a scope
- # with the name we want was ever created in our parent scope. Without this
- # check we might have name collisions if the parent scope on init gets
- # closed before build is called.
- self._variable_scope_counts_on_init = (
- variable_scope._get_default_variable_store().variable_scopes_count)
self._custom_getter, self._deferred_restorations = (
_make_custom_getter_for_deferred_restorations())
@@ -267,29 +261,18 @@ class Network(base.Layer):
def _finalize_name(self, parent_network):
if not self._name:
+ if not parent_network:
+ name_uid_map = base._get_default_graph_uid_map()
+ else:
+ name_uid_map = parent_network._sub_layer_name_uids
# We were not passed a name explicitly (or it was blank), so this is an
# anonymous Network. We make up a unique name.
if parent_network:
avoid_names = parent_network._owned_layers
- name_uid_map = parent_network._sub_layer_name_uids
else:
- name_uid_map = base._get_default_graph_uid_map()
- # Figure out which names we have to avoid based on which variable scope
- # we're nested in.
- strip_name = self._default_parent_variable_scope.name
- if strip_name:
- strip_name += "/"
- def _strip_on_init_scope(name):
- if name.startswith(strip_name):
- return name[len(strip_name):]
- else:
- return None
- avoid_names = set(
- _strip_on_init_scope(name)
- for name in self._variable_scope_counts_on_init.keys() if name)
+ avoid_names = None
self._name, self._base_name = self._make_unique_name(
- name_uid_map=name_uid_map, avoid_names=avoid_names,
- namespace=self._default_parent_variable_scope.name)
+ name_uid_map=name_uid_map, avoid_names=avoid_names)
if self._first_parent is None or (self._first_parent # False = no parent
and self._first_parent() is None):
# Save a pointer to the parent Network so that we can later check that the
@@ -319,13 +302,7 @@ class Network(base.Layer):
parent_scope = first_parent._scope
else:
parent_scope = self._default_parent_variable_scope
- with variable_scope.variable_scope(parent_scope) as parent_vs:
- expected_scope_name = parent_vs.name + "/" + self._name
- if expected_scope_name in self._variable_scope_counts_on_init:
- raise ValueError(
- ("A Network named '%s' already exists (or a variable_scope was "
- "created with this name). Names must be unique.") % (
- self._name,))
+ with variable_scope.variable_scope(parent_scope):
# Make sure variables with this prefix will be unique.
with variable_scope.variable_scope(
None, use_resource=True, default_name=self._name) as scope:
@@ -342,22 +319,25 @@ class Network(base.Layer):
"created with this name). Names must be unique.") % (
self._name,))
if (first_parent
- and scope_prefix[:-1] != first_parent.scope_name):
+ and scope_prefix[:-1] != first_parent._scope.name):
raise ValueError(
("Network variable names must match a nesting of sub-Network "
"names. Expected prefix '%s' from parent network, but got "
"'%s' when attempting to create a variable_scope for Network "
"'%s'. Likely an explicit variable_scope was inserted into "
"the nesting.") % (
- first_parent.scope_name,
+ first_parent._scope.name,
scope_prefix[:-1],
self._name))
elif not first_parent and scope_prefix:
# For the case when this Network is not nested inside any other
- # Network, but is in a variable_scope. This Network's name takes on
- # the full variable scope prefix.
- self._name = scope_name
-
+ # Network, but is in a variable_scope. This is an error for now.
+ raise ValueError(
+ "Creating Networks inside named variable_scopes is currently "
+ "not supported (to ensure that variable names match the names "
+ "of Networks in which they were first created). To set "
+ "options, try `with tf.variable_scope(''):`. If this "
+ "limitation bothers you, please file a feature request.")
for non_network_sublayer in self._non_network_sublayers:
self._set_scope_for_nonnetwork_sublayer(non_network_sublayer)
@@ -375,7 +355,8 @@ class Network(base.Layer):
raise ValueError(
("The parent of a Layer added to Network %s was garbage collected "
"before the Layer was built. If this limitation bothers you "
- "please file a feature request.") %
+ "please, comment on "
+ "https://github.com/tensorflow/tensorflow/issues/14164.") %
(self.name,))
with variable_scope.variable_scope(parent_scope):
# Horrid hack to make Layer variable names which are direct
@@ -439,9 +420,7 @@ class Network(base.Layer):
# name, and we should respect it (subject to error checking).
layer._name, layer._base_name = layer._make_unique_name(
name_uid_map=self._sub_layer_name_uids,
- avoid_names=self._owned_layers
- # No namespace required, since we've specified our own UID map.
- )
+ avoid_names=self._owned_layers)
layer._first_parent = weakref.ref(self)
self._non_network_sublayers.append(layer)
if (not layer.built
@@ -577,7 +556,7 @@ class Network(base.Layer):
if os.path.isdir(save_path):
# If we were passed a directory, default to naming based on the Network
# name.
- save_path = os.path.join(save_path, self.name.replace("/", "_"))
+ save_path = os.path.join(save_path, self.name)
user_map_func = map_func
if map_func is None:
map_func = _make_prefix_stripping_map_fn(self.scope_name)
@@ -771,7 +750,7 @@ class Network(base.Layer):
self._set_scope() # scope_name should be available to map_funcs
if os.path.isdir(save_path):
# If we don't have a name yet, set no parent.
- save_path = os.path.join(save_path, self.name.replace("/", "_"))
+ save_path = os.path.join(save_path, self.name)
user_map_func = map_func
if map_func is None:
map_func = _make_prefix_stripping_map_fn(self.scope_name)
diff --git a/tensorflow/contrib/eager/python/network_test.py b/tensorflow/contrib/eager/python/network_test.py
index 1127055c05..14adbafe57 100644
--- a/tensorflow/contrib/eager/python/network_test.py
+++ b/tensorflow/contrib/eager/python/network_test.py
@@ -410,103 +410,19 @@ class NetworkTest(test.TestCase):
@test_util.run_in_graph_and_eager_modes(assert_no_eager_garbage=True)
def testWrappingInVariableScope(self):
- one = constant_op.constant([[1.]])
- # Naming happens in the order of first build rather than the order of
- # construction, but for clarity they're the same here and construction is
- # annotated.
- outside_net_before = MyNetwork() # name=my_network_1
- outside_net_before(one)
- captured_scope = variable_scope.get_variable_scope()
with variable_scope.variable_scope("outside_scope"):
- net1 = MyNetwork() # name=outside_scope/my_network_1
- net1(one)
- name_conflict1 = MyNetwork(name="name_conflict") # fine, unique so far
- name_conflict2 = MyNetwork(name="name_conflict") # error on build
- with variable_scope.variable_scope("inside_scope"):
- # No issue here since the name is unique within its scope.
- name_conflict3 = MyNetwork(name="name_conflict")
- net2 = MyNetwork() # name=outside_scope/my_network_3 to avoid the
- # variable_scope my_network_2 below.
- vs_name_conflict = MyNetwork(name="vs_name_conflict") # conflict below
- with variable_scope.variable_scope("intervening_scope"):
- with variable_scope.variable_scope(captured_scope):
- with variable_scope.variable_scope("outside_scope"):
- name_conflict4 = MyNetwork(name="name_conflict") # error on build
- with variable_scope.variable_scope("my_network_2"):
- pass
- with variable_scope.variable_scope("vs_name_conflict"):
- pass
- net3 = MyNetwork() # name=outside_scope/my_network_4
- name_conflict1(one)
- with self.assertRaisesRegexp(
- ValueError, "named 'name_conflict' already exists"):
- name_conflict2(one)
- name_conflict3(one)
- net2(one)
- with self.assertRaisesRegexp(
- ValueError, "or a variable_scope was created with this name"):
- vs_name_conflict(one)
- with self.assertRaisesRegexp(
- ValueError, "named 'name_conflict' already exists"):
- name_conflict4(one)
- self.assertEqual("outside_scope/name_conflict",
- name_conflict1.name)
- self.assertStartsWith(
- expected_start="outside_scope/name_conflict/dense_1/",
- actual=name_conflict1.variables[0].name)
- self.assertEqual("outside_scope/inside_scope/name_conflict",
- name_conflict3.name)
- self.assertStartsWith(
- expected_start="outside_scope/inside_scope/name_conflict/dense_1/",
- actual=name_conflict3.variables[0].name)
- self.assertEqual("outside_scope/my_network_1", net1.name)
- self.assertStartsWith(
- expected_start="outside_scope/my_network_1/dense_1/",
- actual=net1.trainable_weights[0].name)
- self.assertEqual("outside_scope/my_network_3", net2.name)
- self.assertStartsWith(
- expected_start="outside_scope/my_network_3/dense_1/",
- actual=net2.trainable_weights[0].name)
- net3(one)
- self.assertEqual("outside_scope/my_network_4", net3.name)
- self.assertStartsWith(
- expected_start="outside_scope/my_network_4/dense_1/",
- actual=net3.trainable_weights[0].name)
- outside_net_after = MyNetwork()
- outside_net_after(one)
- self.assertEqual("my_network_1", outside_net_before.name)
- self.assertStartsWith(
- expected_start="my_network_1/dense_1/",
- actual=outside_net_before.trainable_weights[0].name)
- self.assertEqual("my_network_2", outside_net_after.name)
- self.assertStartsWith(
- expected_start="my_network_2/dense_1/",
- actual=outside_net_after.trainable_weights[0].name)
-
- @test_util.run_in_graph_and_eager_modes()
- def testVariableScopeStripping(self):
- with variable_scope.variable_scope("scope1"):
- with variable_scope.variable_scope("scope2"):
- net = MyNetwork()
- net(constant_op.constant([[2.0]]))
- self.evaluate(net.variables[0].assign([[42.]]))
- self.assertEqual(net.name, "scope1/scope2/my_network_1")
- self.assertStartsWith(
- expected_start="scope1/scope2/my_network_1/dense_1/",
- actual=net.trainable_weights[0].name)
- save_path = net.save(self.get_temp_dir())
- self.assertIn("scope1_scope2_my_network_1", save_path)
- restore_net = MyNetwork()
- # Delayed restoration
- restore_net.restore(save_path)
- restore_net(constant_op.constant([[1.0]]))
- self.assertAllEqual([[42.]],
- self.evaluate(restore_net.variables[0]))
- self.evaluate(restore_net.variables[0].assign([[-1.]]))
- # Immediate restoration
- restore_net.restore(save_path)
- self.assertAllEqual([[42.]],
- self.evaluate(restore_net.variables[0]))
+ net = MyNetwork()
+ one = constant_op.constant([[1.]])
+ with self.assertRaisesRegexp(
+ ValueError,
+ ("Creating Networks inside named variable_scopes is currently not "
+ "supported")):
+ net(one)
+      # Alternatively, we could rename the Network to match the variable_scope:
+ # self.assertEqual("outside_scope/my_network_1", net.name)
+ # self.assertStartsWith(
+ # expected_start="outside_scope/my_network_1/dense/",
+ # actual=net.trainable_weights[0].name)
@test_util.run_in_graph_and_eager_modes()
def testLayerNamesRespected(self):
diff --git a/tensorflow/contrib/estimator/BUILD b/tensorflow/contrib/estimator/BUILD
index bc67ef8354..6eb2cfdaca 100644
--- a/tensorflow/contrib/estimator/BUILD
+++ b/tensorflow/contrib/estimator/BUILD
@@ -204,13 +204,10 @@ py_library(
],
srcs_version = "PY2AND3",
deps = [
- "//tensorflow/python:array_ops",
"//tensorflow/python:control_flow_ops",
"//tensorflow/python:framework_ops",
"//tensorflow/python:math_ops",
- "//tensorflow/python:summary",
"//tensorflow/python/estimator:head",
- "//tensorflow/python/estimator:metric_keys",
"//tensorflow/python/estimator:model_fn",
"//tensorflow/python/saved_model:signature_constants",
"@six_archive//:six",
@@ -232,7 +229,7 @@ py_test(
"//tensorflow/python:string_ops",
"//tensorflow/python/estimator:metric_keys",
"//tensorflow/python/estimator:model_fn",
- "//tensorflow/python/estimator:prediction_keys",
+ "//tensorflow/python/ops/losses",
"//tensorflow/python/saved_model:signature_constants",
"//third_party/py/numpy",
"@six_archive//:six",
diff --git a/tensorflow/contrib/estimator/python/estimator/head.py b/tensorflow/contrib/estimator/python/estimator/head.py
index a9311a20f1..e344ee3c3e 100644
--- a/tensorflow/contrib/estimator/python/estimator/head.py
+++ b/tensorflow/contrib/estimator/python/estimator/head.py
@@ -28,7 +28,6 @@ from tensorflow.python.framework import dtypes
from tensorflow.python.framework import ops
from tensorflow.python.framework import sparse_tensor
from tensorflow.python.ops import array_ops
-from tensorflow.python.ops import check_ops
from tensorflow.python.ops import control_flow_ops
from tensorflow.python.ops import lookup_ops
from tensorflow.python.ops import math_ops
@@ -49,20 +48,7 @@ def multi_class_head(n_classes,
Uses `sparse_softmax_cross_entropy` loss.
- The head expects `logits` with shape `[D0, D1, ... DN, n_classes]`.
- In many applications, the shape is `[batch_size, n_classes]`.
-
- `labels` must be a dense `Tensor` with shape matching `logits`, namely
- `[D0, D1, ... DN, 1]`. If `label_vocabulary` given, `labels` must be a string
- `Tensor` with values from the vocabulary. If `label_vocabulary` is not given,
- `labels` must be an integer `Tensor` with values specifying the class index.
-
- If `weight_column` is specified, weights must be of shape
- `[D0, D1, ... DN]`, or `[D0, D1, ... DN, 1]`.
-
- The loss is the weighted sum over the input dimensions. Namely, if the input
- labels have shape `[batch_size, 1]`, the loss is the weighted sum over
- `batch_size`.
+ This head expects to be fed integer labels specifying the class index.
Args:
n_classes: Number of classes, must be greater than 2 (for 2 classes, use
@@ -71,11 +57,11 @@ def multi_class_head(n_classes,
`tf.feature_column.numeric_column` defining feature column representing
weights. It is used to down weight or boost examples during training. It
will be multiplied by the loss of the example.
- label_vocabulary: A list or tuple of strings representing possible label
- values. If it is not given, that means labels are already encoded as an
- integer within [0, n_classes). If given, labels must be of string type and
- have any value in `label_vocabulary`. Note that errors will be raised if
- `label_vocabulary` is not provided but labels are strings.
+    label_vocabulary: A list of strings representing possible label values. If
+      it is not given, labels are assumed to be already encoded as integers
+      within [0, n_classes). If given, labels must be of string type and take
+      values in `label_vocabulary`. Errors will be raised if the vocabulary is
+      not provided but the labels are strings.
name: name of the head. If provided, summary and metrics keys will be
suffixed by `"/" + name`. Also used as `name_scope` when creating ops.
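
As a usage sketch of the contract described above (integer class ids by default, string labels only with a vocabulary), assuming the public `tf.contrib.estimator` exports of this module:

```python
import tensorflow as tf

# Integer labels in [0, n_classes); no vocabulary needed.
head = tf.contrib.estimator.multi_class_head(n_classes=3)

# String labels require a vocabulary, otherwise an error is raised.
vocab_head = tf.contrib.estimator.multi_class_head(
    n_classes=3, label_vocabulary=['cat', 'dog', 'bird'],
    name='animal')  # summary/metric keys get suffixed by '/animal'
```
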
@@ -98,20 +84,7 @@ def binary_classification_head(
This head uses `sigmoid_cross_entropy_with_logits` loss.
- The head expects `logits` with shape `[D0, D1, ... DN, 1]`.
- In many applications, the shape is `[batch_size, 1]`.
-
- `labels` must be a dense `Tensor` with shape matching `logits`, namely
- `[D0, D1, ... DN, 1]`. If `label_vocabulary` given, `labels` must be a string
- `Tensor` with values from the vocabulary. If `label_vocabulary` is not given,
- `labels` must be float `Tensor` with values in the interval `[0, 1]`.
-
- If `weight_column` is specified, weights must be of shape
- `[D0, D1, ... DN]`, or `[D0, D1, ... DN, 1]`.
-
- The loss is the weighted sum over the input dimensions. Namely, if the input
- labels have shape `[batch_size, 1]`, the loss is the weighted sum over
- `batch_size`.
+ This head expects to be fed float labels of shape `(batch_size, 1)`.
Args:
weight_column: A string or a `_NumericColumn` created by
@@ -123,11 +96,11 @@ def binary_classification_head(
generated for each threshold value. This threshold is applied to the
logistic values to determine the binary classification (i.e., above the
       threshold is `true`, below is `false`).
- label_vocabulary: A list or tuple of strings representing possible label
- values. If it is not given, labels must be float with values within
- [0, 1]. If given, labels must be string type and have any value in
- `label_vocabulary`. Note that errors will be raised if `label_vocabulary`
- is not provided but labels are strings.
+    label_vocabulary: A list of strings representing possible label values. If
+      it is not given, labels are assumed to be float values within [0, 1]. If
+      given, labels must be of string type and take values in
+      `label_vocabulary`. Errors will be raised if the vocabulary is not
+      provided but the labels are strings.
name: name of the head. If provided, summary and metrics keys will be
suffixed by `"/" + name`. Also used as `name_scope` when creating ops.
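
Similarly for the binary head, a sketch under the same assumptions:

```python
import tensorflow as tf

# Float labels in [0, 1]; each threshold adds accuracy/precision/recall
# metrics computed at that cutoff.
head = tf.contrib.estimator.binary_classification_head(
    thresholds=(0.25, 0.5, 0.75))

# With a vocabulary, labels are strings mapped onto {0, 1}.
vocab_head = tf.contrib.estimator.binary_classification_head(
    label_vocabulary=['negative', 'positive'])
```
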
@@ -147,22 +120,9 @@ def binary_classification_head(
def regression_head(weight_column=None,
label_dimension=1,
name=None):
- """Creates a `_Head` for regression using the `mean_squared_error` loss.
-
- The loss is the weighted sum over all input dimensions. Namely, if the input
- labels have shape `[batch_size, label_dimension]`, the loss is the weighted
- sum over both `batch_size` and `label_dimension`.
-
- The head expects `logits` with shape `[D0, D1, ... DN, label_dimension]`.
- In many applications, the shape is `[batch_size, label_dimension]`.
-
- The `labels` shape must match `logits`, namely
- `[D0, D1, ... DN, label_dimension]`. If `label_dimension=1`, shape
- `[D0, D1, ... DN]` is also supported.
+  """Creates a `_Head` for regression using the mean squared error loss.
- If `weight_column` is specified, weights must be of shape
- `[D0, D1, ... DN]`, `[D0, D1, ... DN, 1]` or
- `[D0, D1, ... DN, label_dimension]`.
+ Uses `mean_squared_error` loss.
Args:
weight_column: A string or a `_NumericColumn` created by
@@ -196,29 +156,15 @@ def multi_label_head(n_classes,
or more associated labels, from a discrete set. This is distinct from
`multi_class_head` which has exactly one label per example.
- Uses `sigmoid_cross_entropy` loss average over classes and weighted sum over
- the batch. Namely, if the input logits have shape `[batch_size, n_classes]`,
- the loss is the average over `n_classes` and the weighted sum over
- `batch_size`.
-
- The head expects `logits` with shape `[D0, D1, ... DN, n_classes]`. In many
- applications, the shape is `[batch_size, label_n_classes]`.
-
- Labels can be:
- * A multi-hot tensor of shape `[D0, D1, ... DN, n_classes]`
- * An integer `SparseTensor` of class indices. The `dense_shape` must be
- `[D0, D1, ... DN, ?]` and the values within `[0, n_classes)`.
- * If `label_vocabulary` is given, a string `SparseTensor`. The `dense_shape`
- must be `[D0, D1, ... DN, ?]` and the values within `label_vocabulary`.
-
- If `weight_column` is specified, weights must be of shape
- `[D0, D1, ... DN]`, or `[D0, D1, ... DN, 1]`.
+ Uses `sigmoid_cross_entropy` loss averaged over classes. Expects labels as a
+ multi-hot tensor of shape `[batch_size, n_classes]`, or as an integer
+ `SparseTensor` of class indices.
Also supports custom `loss_fn`. `loss_fn` takes `(labels, logits)` or
`(labels, logits, features)` as arguments and returns unreduced loss with
- shape `[D0, D1, ... DN, 1]`. `loss_fn` must support indicator `labels` with
- shape `[D0, D1, ... DN, n_classes]`. Namely, the head applies
- `label_vocabulary` to the input labels before passing them to `loss_fn`.
+ shape `[batch_size, 1]`. `loss_fn` must support indicator `labels` with shape
+ `[batch_size, n_classes]`. Namely, the head applies `label_vocabulary` to the
+ input labels before passing them to `loss_fn`.
Args:
n_classes: Number of classes, must be greater than 1 (for 1 class, use
@@ -245,7 +191,7 @@ def multi_label_head(n_classes,
An instance of `_Head` for multi-label classification.
Raises:
- ValueError: if `n_classes`, `thresholds`, or `loss_fn` is invalid.
+ ValueError: if `n_classes` or `thresholds` is invalid.
"""
thresholds = tuple(thresholds) if thresholds else tuple()
if n_classes is None or n_classes < 2:
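
A sketch of the `loss_fn` contract stated above: it receives indicator labels of shape `[batch_size, n_classes]` (the head applies `label_vocabulary` first) and must return unreduced loss of shape `[batch_size, 1]`; `my_loss_fn` is a hypothetical example.

```python
import tensorflow as tf

def my_loss_fn(labels, logits):
  # labels: indicator Tensor of shape [batch_size, n_classes].
  per_class = tf.nn.sigmoid_cross_entropy_with_logits(
      labels=tf.to_float(labels), logits=logits)
  # Reduce over classes only; the head weights and sums over the batch.
  return tf.reduce_mean(per_class, axis=-1, keep_dims=True)

head = tf.contrib.estimator.multi_label_head(n_classes=3, loss_fn=my_loss_fn)
```
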
@@ -313,36 +259,26 @@ class _MultiLabelHead(head_lib._Head): # pylint:disable=protected-access
indices=labels.indices,
values=label_ids_values,
dense_shape=labels.dense_shape)
- return math_ops.to_int64(
- sparse_ops.sparse_to_indicator(label_ids, self._n_classes))
else:
- err_msg = (
- r'labels must be an integer SparseTensor with values in '
- r'[0, {})'.format(self._n_classes))
- assert_int = check_ops.assert_integer(
- labels.values, message=err_msg)
- assert_less = check_ops.assert_less(
- labels.values,
- ops.convert_to_tensor(self._n_classes, dtype=labels.dtype),
- message=err_msg)
- assert_greater = check_ops.assert_non_negative(
- labels.values, message=err_msg)
- with ops.control_dependencies(
- [assert_int, assert_less, assert_greater]):
- return math_ops.to_int64(
- sparse_ops.sparse_to_indicator(labels, self._n_classes))
- err_msg = (
- r'labels must be an integer indicator Tensor with values in [0, 1]')
- return head_lib._assert_range(labels, 2, message=err_msg) # pylint:disable=protected-access,
+ label_ids = labels
+ return math_ops.to_int64(
+ sparse_ops.sparse_to_indicator(label_ids, self._n_classes))
+ msg = ('labels shape must be [batch_size, {}]. '
+ 'Given: ').format(self._n_classes)
+ labels_shape = array_ops.shape(labels)
+ check_rank_op = control_flow_ops.Assert(
+ math_ops.equal(array_ops.rank(labels), 2),
+ data=[msg, labels_shape])
+ check_label_dim = control_flow_ops.Assert(
+ math_ops.equal(labels_shape[-1], self._n_classes),
+ data=[msg, labels_shape])
+ with ops.control_dependencies([check_rank_op, check_label_dim]):
+ return array_ops.identity(labels)
def create_loss(self, features, mode, logits, labels):
"""See `Head`."""
del mode # Unused for this head.
- logits = ops.convert_to_tensor(logits)
processed_labels = self._process_labels(labels)
- processed_labels = head_lib._check_dense_labels_match_logits_and_reshape( # pylint:disable=protected-access
- labels=processed_labels, logits=logits,
- expected_labels_dimension=self.logits_dimension)
if self._loss_fn:
unweighted_loss = _call_loss_fn(
loss_fn=self._loss_fn, labels=processed_labels, logits=logits,
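
For reference, here is what `sparse_to_indicator` does to a `SparseTensor` of class indices in `_process_labels` above; a graph-mode sketch:

```python
import tensorflow as tf
from tensorflow.python.ops import sparse_ops

# Example 0 has class 0; example 1 has classes 1 and 2.
labels = tf.SparseTensor(
    values=[0, 1, 2], indices=[[0, 0], [1, 0], [1, 1]], dense_shape=[2, 2])
indicator = tf.to_int64(sparse_ops.sparse_to_indicator(labels, 3))
with tf.Session() as sess:
  print(sess.run(indicator))  # [[1 0 0]
                              #  [0 1 1]]
```
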
@@ -354,8 +290,7 @@ class _MultiLabelHead(head_lib._Head): # pylint:disable=protected-access
# Averages loss over classes.
unweighted_loss = math_ops.reduce_mean(
unweighted_loss, axis=-1, keep_dims=True)
- weights = head_lib._get_weights_and_check_match_logits( # pylint:disable=protected-access,
- features=features, weight_column=self._weight_column, logits=logits)
+ weights = head_lib._weights(features, self._weight_column) # pylint:disable=protected-access,
weighted_sum_loss = losses.compute_weighted_loss(
unweighted_loss, weights=weights, reduction=losses.Reduction.SUM)
# _weights() can return 1.
@@ -370,7 +305,7 @@ class _MultiLabelHead(head_lib._Head): # pylint:disable=protected-access
self, features, mode, logits, labels=None, train_op_fn=None):
"""See `Head`."""
with ops.name_scope(self._name, 'head'):
- logits = head_lib._check_logits_final_dim(logits, self.logits_dimension) # pylint:disable=protected-access
+ logits = head_lib._check_logits(logits, self.logits_dimension) # pylint:disable=protected-access
# Predict.
pred_keys = prediction_keys.PredictionKeys
@@ -400,8 +335,6 @@ class _MultiLabelHead(head_lib._Head): # pylint:disable=protected-access
# Eval.
if mode == model_fn.ModeKeys.EVAL:
- weights = head_lib._get_weights_and_check_match_logits( # pylint:disable=protected-access,
- features=features, weight_column=self._weight_column, logits=logits)
return model_fn.EstimatorSpec(
mode=model_fn.ModeKeys.EVAL,
predictions=predictions,
@@ -409,7 +342,7 @@ class _MultiLabelHead(head_lib._Head): # pylint:disable=protected-access
eval_metric_ops=self._eval_metric_ops(
labels=processed_labels,
probabilities=probabilities,
- weights=weights,
+ weights=head_lib._weights(features, self._weight_column), # pylint:disable=protected-access,
weighted_sum_loss=weighted_sum_loss,
example_weight_sum=example_weight_sum))
diff --git a/tensorflow/contrib/estimator/python/estimator/head_test.py b/tensorflow/contrib/estimator/python/estimator/head_test.py
index d1cf909004..fd8c53f6a9 100644
--- a/tensorflow/contrib/estimator/python/estimator/head_test.py
+++ b/tensorflow/contrib/estimator/python/estimator/head_test.py
@@ -316,14 +316,13 @@ class MultiLabelHead(test.TestCase):
_initialize_variables(self, monitored_session.Scaffold())
with self.assertRaisesRegexp(
errors.InvalidArgumentError,
- r'\[expected_labels_shape: \] \[2 2\] \[labels_shape: \] \[2 1\]'):
+ r'labels shape must be \[batch_size, 2\]\. Given: \] \[2 1\]'):
actual_weighted_sum_loss.eval({
labels_placeholder: np.array([[1], [1]], dtype=np.int64)
})
with self.assertRaisesRegexp(
errors.InvalidArgumentError,
- r'labels shape must be \[D0, D1, ... DN, 2\]\..*'
- r'\[Received shape: \] \[2\]'):
+ r'labels shape must be \[batch_size, 2\]\. Given: \] \[2\]'):
actual_weighted_sum_loss.eval({
labels_placeholder: np.array([1, 1], dtype=np.int64)
})
@@ -388,11 +387,9 @@ class MultiLabelHead(test.TestCase):
logits=np.array([[-10., 10.], [-15., 10.]], dtype=np.float32),
labels=None)
- def _test_eval(
- self, head, logits, labels, expected_loss, expected_metrics,
- features=None):
+ def _test_eval(self, head, logits, labels, expected_loss, expected_metrics):
spec = head.create_estimator_spec(
- features=features or {},
+ features={'x': np.array(((42,),), dtype=np.int32)},
mode=model_fn.ModeKeys.EVAL,
logits=logits,
labels=labels)
@@ -658,54 +655,6 @@ class MultiLabelHead(test.TestCase):
labels=None,
train_op_fn=_no_op_train_fn)
- def test_train_invalid_indicator_labels(self):
- head = head_lib.multi_label_head(n_classes=2)
- logits = np.array([[-10., 10.], [-15., 10.]], dtype=np.float32)
- # The value 2 is outside the allowed range.
- labels = np.array([[2, 0], [1, 1]], dtype=np.int64)
- def _train_op_fn(loss):
- del loss
- return control_flow_ops.no_op()
-
- spec = head.create_estimator_spec(
- features={},
- mode=model_fn.ModeKeys.TRAIN,
- logits=logits,
- labels=labels,
- train_op_fn=_train_op_fn)
- with self.test_session() as sess:
- _initialize_variables(self, spec.scaffold)
- with self.assertRaisesRegexp(
- errors.InvalidArgumentError,
- r'labels must be an integer indicator Tensor with values in '
- r'\[0, 1\]'):
- sess.run(spec.loss)
-
- def test_train_invalid_sparse_labels(self):
- head = head_lib.multi_label_head(n_classes=2)
- logits = np.array([[-10., 10.], [-15., 10.]], dtype=np.float32)
- # The value 2 is outside the allowed range.
- labels = sparse_tensor.SparseTensor(
- values=[2, 0, 1],
- indices=[[0, 0], [1, 0], [1, 1]],
- dense_shape=[2, 2])
- def _train_op_fn(loss):
- del loss
- return control_flow_ops.no_op()
-
- spec = head.create_estimator_spec(
- features={},
- mode=model_fn.ModeKeys.TRAIN,
- logits=logits,
- labels=labels,
- train_op_fn=_train_op_fn)
- with self.test_session() as sess:
- _initialize_variables(self, spec.scaffold)
- with self.assertRaisesRegexp(
- errors.InvalidArgumentError,
- r'labels must be an integer SparseTensor with values in \[0, 2\)'):
- sess.run(spec.loss)
-
def _test_train(self, head, logits, labels, expected_loss):
expected_train_result = 'my_train_op'
def _train_op_fn(loss):
@@ -842,153 +791,6 @@ class MultiLabelHead(test.TestCase):
metric_keys.MetricKeys.LOSS_MEAN: expected_loss / 3,
}, summary_str, tol)
- def test_multi_dim_weighted_train_create_loss(self):
- """Logits and labels of shape [2, 2, 3], weights [2, 2]."""
- head = head_lib.multi_label_head(n_classes=3, weight_column='weights')
-
- logits = np.array([[[-10., 10., -10.], [10., -10., 10.]],
- [[-12., 12., -12.], [12., -12., 12.]]], dtype=np.float32)
- labels = np.array([[[1, 0, 0], [1, 0, 0]],
- [[0, 1, 1], [0, 1, 1]]], dtype=np.int64)
- weights = np.array([[1., 1.5], [2., 2.5]], dtype=np.float32)
- # loss = [[10 + 10 + 0, 0 + 0 + 10], [0 + 0 + 12, 12 + 12 + 0]] / 3
- # = [[20/3, 10/3], [4, 8]]
- # weighted_sum_loss = 1*20/3 + 1.5*10/3 + 2*4 + 2.5*8 = 39.6667
- expected_weighted_sum_loss = 39.6667
- expected_example_weight_sum = np.sum(weights)
- actual_weighted_sum_loss, actual_example_weight_sum, _ = head.create_loss(
- features={'weights': weights},
- mode=model_fn.ModeKeys.TRAIN,
- logits=logits,
- labels=labels)
- atol = 1.e-3
- with self.test_session():
- _initialize_variables(self, monitored_session.Scaffold())
- self.assertAllClose(
- expected_weighted_sum_loss, actual_weighted_sum_loss.eval(),
- atol=atol)
- self.assertAllClose(
- expected_example_weight_sum, actual_example_weight_sum.eval(),
- atol=atol)
-
- def test_multi_dim_weighted_train(self):
- """Logits and labels of shape [2, 2, 3], weights [2, 2]."""
- head = head_lib.multi_label_head(n_classes=3, weight_column='weights')
-
- logits = np.array([[[-10., 10., -10.], [10., -10., 10.]],
- [[-12., 12., -12.], [12., -12., 12.]]], dtype=np.float32)
- labels = np.array([[[1, 0, 0], [1, 0, 0]],
- [[0, 1, 1], [0, 1, 1]]], dtype=np.int64)
- weights = np.array([[1., 1.5], [2., 2.5]], dtype=np.float32)
- # loss = [[10 + 10 + 0, 0 + 0 + 10], [0 + 0 + 12, 12 + 12 + 0]] / 3
- # = [[20/3, 10/3], [4, 8]]
- # weighted_sum_loss = 1*20/3 + 1.5*10/3 + 2*4 + 2.5*8 = 39.6667
- expected_loss = 39.6667
- expected_train_result = 'my_train_op'
- def _train_op_fn(loss):
- return string_ops.string_join(
- [constant_op.constant(expected_train_result),
- string_ops.as_string(loss, precision=3)])
-
- spec = head.create_estimator_spec(
- features={'weights': weights},
- mode=model_fn.ModeKeys.TRAIN,
- logits=logits,
- labels=labels,
- train_op_fn=_train_op_fn)
-
- atol = 1.e-3
- with self.test_session() as sess:
- _initialize_variables(self, monitored_session.Scaffold())
- loss, train_result = sess.run((spec.loss, spec.train_op))
- self.assertAllClose(expected_loss, loss, atol=atol)
- self.assertEqual(
- six.b('{0:s}{1:.3f}'.format(expected_train_result, expected_loss)),
- train_result)
-
- def test_multi_dim_weights_wrong_inner_dim(self):
- """Logits and labels of shape [2, 2, 3], weights [2, 1]."""
- head = head_lib.multi_label_head(n_classes=3, weight_column='weights')
-
- logits = np.array([[[-10., 10., -10.], [10., -10., 10.]],
- [[-12., 12., -12.], [12., -12., 12.]]], dtype=np.float32)
- labels = np.array([[[1, 0, 0], [1, 0, 0]],
- [[0, 1, 1], [0, 1, 1]]], dtype=np.int64)
- weights = np.array([[1.], [2.]], dtype=np.float32)
- def _train_op_fn(loss):
- del loss
- return control_flow_ops.no_op()
-
- spec = head.create_estimator_spec(
- features={'weights': weights},
- mode=model_fn.ModeKeys.TRAIN,
- logits=logits,
- labels=labels,
- train_op_fn=_train_op_fn)
- with self.test_session():
- _initialize_variables(self, monitored_session.Scaffold())
- with self.assertRaisesRegexp(
- errors.InvalidArgumentError,
- r'\[logits_shape: \] \[2 2 3\] \[weights_shape: \] \[2 1\]'):
- spec.loss.eval()
-
- def test_multi_dim_weights_wrong_outer_dim(self):
- """Logits and labels of shape [2, 2, 3], weights [2, 2, 3]."""
- head = head_lib.multi_label_head(n_classes=3, weight_column='weights')
-
- logits = np.array([[[-10., 10., -10.], [10., -10., 10.]],
- [[-12., 12., -12.], [12., -12., 12.]]], dtype=np.float32)
- labels = np.array([[[1, 0, 0], [1, 0, 0]],
- [[0, 1, 1], [0, 1, 1]]], dtype=np.int64)
- weights = np.array([[[1., 1., 1.], [1.5, 1.5, 1.5]],
- [[2., 2., 2.], [2.5, 2.5, 2.5]]], dtype=np.float32)
- weights_placeholder = array_ops.placeholder(dtype=dtypes.float32)
- def _train_op_fn(loss):
- del loss
- return control_flow_ops.no_op()
-
- spec = head.create_estimator_spec(
- features={'weights': weights_placeholder},
- mode=model_fn.ModeKeys.TRAIN,
- logits=logits,
- labels=labels,
- train_op_fn=_train_op_fn)
- with self.test_session():
- _initialize_variables(self, monitored_session.Scaffold())
- with self.assertRaisesRegexp(
- errors.InvalidArgumentError,
- r'\[logits_shape: \] \[2 2 3\] \[weights_shape: \] \[2 2 3\]'):
- spec.loss.eval({weights_placeholder: weights})
-
- def test_multi_dim_weighted_eval(self):
- """Logits and labels of shape [2, 2, 3], weights [2, 2]."""
- head = head_lib.multi_label_head(n_classes=3, weight_column='weights')
-
- logits = np.array([[[-10., 10., -10.], [10., -10., 10.]],
- [[-12., 12., -12.], [12., -12., 12.]]], dtype=np.float32)
- labels = np.array([[[1, 0, 0], [1, 0, 0]],
- [[0, 1, 1], [0, 1, 1]]], dtype=np.int64)
- weights = np.array([[1., 1.5], [2., 2.5]], dtype=np.float32)
- # loss = [[10 + 10 + 0, 0 + 0 + 10], [0 + 0 + 12, 12 + 12 + 0]] / 3
- # = [[20/3, 10/3], [4, 8]]
- # weighted_sum_loss = 1*20/3 + 1.5*10/3 + 2*4 + 2.5*8 = 39.6667
- expected_loss = 39.6667
- keys = metric_keys.MetricKeys
- expected_metrics = {
- keys.LOSS_MEAN: expected_loss / np.sum(weights),
- # auc and auc_pr cannot be reliably calculated for only 4 samples, but
- # this assert tests that the algorithm remains consistent.
- keys.AUC: 0.4977,
- keys.AUC_PR: 0.6645,
- }
- self._test_eval(
- head=head,
- features={'weights': weights},
- logits=logits,
- labels=labels,
- expected_loss=expected_loss,
- expected_metrics=expected_metrics)
-
if __name__ == '__main__':
test.main()
diff --git a/tensorflow/contrib/estimator/python/estimator/multi_head.py b/tensorflow/contrib/estimator/python/estimator/multi_head.py
index 73bae5acf9..69dbfcee62 100644
--- a/tensorflow/contrib/estimator/python/estimator/multi_head.py
+++ b/tensorflow/contrib/estimator/python/estimator/multi_head.py
@@ -22,13 +22,10 @@ import six
from tensorflow.python.estimator import model_fn
from tensorflow.python.estimator.canned import head as head_lib
-from tensorflow.python.estimator.canned import metric_keys
from tensorflow.python.framework import ops
-from tensorflow.python.ops import array_ops
from tensorflow.python.ops import control_flow_ops
from tensorflow.python.ops import math_ops
from tensorflow.python.saved_model import signature_constants
-from tensorflow.python.summary import summary
_DEFAULT_SERVING_KEY = signature_constants.DEFAULT_SERVING_SIGNATURE_DEF_KEY
@@ -75,23 +72,6 @@ def multi_head(heads, head_weights=None):
estimator.train(input_fn=input_fn, steps=100)
```
- Also supports `logits` as a `Tensor` of shape
- `[D0, D1, ... DN, logits_dimension]`. It will split the `Tensor` along the
- last dimension and distribute it appropriately among the heads. E.g.:
-
- ```python
- def model_fn(features, labels, mode):
- # Create simple heads and specify head name.
- head1 = multi_class_head(n_classes=3, name='head1')
- head2 = binary_classification_head(name='head2')
- # Create multi-head from two simple heads.
- head = multi_head([head1, head2])
- # Create logits for the multihead.
- logits = logit_fn(logits_dimension=head.logits_dimension)
- # Return the merged EstimatorSpec
- return head.create_estimator_spec(..., logits=logits, ...)
- ```
-
Args:
heads: List or tuple of `_Head` instances. All heads must have `name`
specified. The first head in the list is the default used at serving time.
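
Since this revert drops `_split_logits`, `logits` (and `labels`) must be dicts keyed by head name; a sketch assuming the `tf.contrib.estimator` exports:

```python
import numpy as np
import tensorflow as tf

head1 = tf.contrib.estimator.multi_label_head(n_classes=2, name='head1')
head2 = tf.contrib.estimator.multi_label_head(n_classes=3, name='head2')
head = tf.contrib.estimator.multi_head([head1, head2])

# A single merged logits Tensor now raises ValueError; pass dicts instead.
spec = head.create_estimator_spec(
    features={'x': np.array([[42.]], dtype=np.float32)},
    mode=tf.estimator.ModeKeys.TRAIN,
    logits={'head1': tf.constant([[-1., 1.]]),
            'head2': tf.constant([[2., -2., 2.]])},
    labels={'head1': np.array([[1, 0]], dtype=np.int64),
            'head2': np.array([[0, 1, 0]], dtype=np.int64)},
    train_op_fn=lambda loss: tf.no_op())
```
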
@@ -181,17 +161,18 @@ class _MultiHead(head_lib._Head): # pylint:disable=protected-access
def create_loss(self, features, mode, logits, labels):
"""See `Head`."""
- if isinstance(logits, dict):
- logits_dict = logits
- else:
- logits_dict = self._split_logits(logits)
+    # TODO(roumposg): Add support for logits as a single Tensor (with a
+    # _split_logits utility).
+ if not isinstance(logits, dict):
+ raise ValueError('logits must be a dict. Single Tensor support coming '
+ 'soon.')
weighted_sum_losses = []
example_weight_sums = []
labels_by_head = {}
for head in self._heads:
(weighted_sum_loss,
example_weight_sum, processed_labels) = head.create_loss(
- features, mode, logits_dict[head.name], labels[head.name])
+ features, mode, logits[head.name], labels[head.name])
weighted_sum_losses.append(weighted_sum_loss)
example_weight_sums.append(example_weight_sum)
labels_by_head[head.name] = processed_labels
@@ -224,10 +205,10 @@ class _MultiHead(head_lib._Head): # pylint:disable=protected-access
def create_estimator_spec(
self, features, mode, logits, labels=None, train_op_fn=None):
"""See `_Head`."""
- if isinstance(logits, dict):
- logits_dict = logits
- else:
- logits_dict = self._split_logits(logits)
+    # TODO(roumposg): Add support for logits as a single Tensor (with a
+    # _split_logits utility).
+ if not isinstance(logits, dict):
+ raise ValueError('logits must be a dict. Given: {}'.format(logits))
if labels and not isinstance(labels, dict):
raise ValueError('labels must be a dict. Given: {}'.format(labels))
@@ -238,42 +219,22 @@ class _MultiHead(head_lib._Head): # pylint:disable=protected-access
head.create_estimator_spec(
features=features,
mode=mode,
- logits=logits_dict[head_name],
+ logits=logits[head_name],
labels=labels[head_name] if labels else None,
train_op_fn=_no_op_train_fn))
+ # TODO(roumposg): Add LOSS and LOSS_MEAN summaries for the total head-
+ # combined loss.
if mode == model_fn.ModeKeys.TRAIN:
if train_op_fn is None:
raise ValueError('train_op_fn can not be None in TRAIN mode.')
- spec = self._merge_train(all_estimator_spec, train_op_fn)
- with ops.name_scope(''):
- summary.scalar(metric_keys.MetricKeys.LOSS, spec.loss)
- return spec
+ return self._merge_train(all_estimator_spec, train_op_fn)
if mode == model_fn.ModeKeys.PREDICT:
return self._merge_predict(all_estimator_spec)
if mode == model_fn.ModeKeys.EVAL:
return self._merge_eval(all_estimator_spec)
raise ValueError('mode={} unrecognized'.format(mode))
- def _split_logits(self, logits):
- """Splits logits along the last dimension and returns a dict."""
- logits_dict = {}
- with ops.name_scope(None, 'split_logits', values=[logits]):
- logits = ops.convert_to_tensor(logits)
- batch_shape = array_ops.shape(logits)[:-1]
- zeros_like_batch_shape = array_ops.zeros_like(batch_shape)
- minus_ones_like_batch_shape = -1 * array_ops.ones_like(batch_shape)
- begin_idx = 0
- for head in self._heads:
- begin_tensor = array_ops.concat(
- [zeros_like_batch_shape, [begin_idx]], axis=0)
- size_tensor = array_ops.concat(
- [minus_ones_like_batch_shape, [head.logits_dimension]], axis=0)
- logits_dict[head.name] = array_ops.slice(
- logits, begin=begin_tensor, size=size_tensor)
- begin_idx += head.logits_dimension
- return logits_dict
-
def _merge_train(self, all_estimator_spec, train_op_fn):
"""Merges list of `EstimatorSpec` for training.
diff --git a/tensorflow/contrib/estimator/python/estimator/multi_head_test.py b/tensorflow/contrib/estimator/python/estimator/multi_head_test.py
index 8d51a298b2..16177aebd5 100644
--- a/tensorflow/contrib/estimator/python/estimator/multi_head_test.py
+++ b/tensorflow/contrib/estimator/python/estimator/multi_head_test.py
@@ -106,8 +106,7 @@ class MultiHeadTest(test.TestCase):
multi_head = multi_head_lib.multi_head([head1, head2])
self.assertEqual('head1_head2', multi_head.name)
- def test_predict_two_heads_logits_dict(self):
- """Tests predict with logits as dict."""
+ def test_predict_two_heads(self):
head1 = head_lib.multi_label_head(n_classes=2, name='head1')
head2 = head_lib.multi_label_head(n_classes=3, name='head2')
multi_head = multi_head_lib.multi_head([head1, head2])
@@ -159,111 +158,6 @@ class MultiHeadTest(test.TestCase):
expected_probabilities['head2'],
sess.run(spec.export_outputs['head2'].scores))
- def test_predict_two_heads_logits_tensor(self):
- """Tests predict with logits as Tensor."""
- head1 = head_lib.multi_label_head(n_classes=2, name='head1')
- head2 = head_lib.multi_label_head(n_classes=3, name='head2')
- multi_head = multi_head_lib.multi_head([head1, head2])
-
- logits = np.array(
- [[-1., 1., 2., -2., 2.], [-1.5, 1., -3., 2., -2.]], dtype=np.float32)
- expected_logits1 = np.array([[-1., 1.], [-1.5, 1.]], dtype=np.float32)
- expected_logits2 = np.array([[2., -2., 2.], [-3., 2., -2.]],
- dtype=np.float32)
- expected_probabilities = {
- 'head1': _sigmoid(expected_logits1),
- 'head2': _sigmoid(expected_logits2),
- }
-
- spec = multi_head.create_estimator_spec(
- features={'x': np.array(((42,),), dtype=np.int32)},
- mode=model_fn.ModeKeys.PREDICT,
- logits=logits)
-
- self.assertItemsEqual(
- (_DEFAULT_SERVING_KEY, 'head1', 'classification/head1', 'predict/head1',
- 'head2', 'classification/head2', 'predict/head2'),
- spec.export_outputs.keys())
-
- # Assert predictions and export_outputs.
- with self.test_session() as sess:
- _initialize_variables(self, spec.scaffold)
- self.assertIsNone(spec.scaffold.summary_op)
- predictions = sess.run(spec.predictions)
- self.assertAllClose(
- expected_logits1,
- predictions[('head1', prediction_keys.PredictionKeys.LOGITS)])
- self.assertAllClose(
- expected_logits2,
- predictions[('head2', prediction_keys.PredictionKeys.LOGITS)])
- self.assertAllClose(
- expected_probabilities['head1'],
- predictions[('head1', prediction_keys.PredictionKeys.PROBABILITIES)])
- self.assertAllClose(
- expected_probabilities['head2'],
- predictions[('head2', prediction_keys.PredictionKeys.PROBABILITIES)])
-
- self.assertAllClose(
- expected_probabilities['head1'],
- sess.run(spec.export_outputs[_DEFAULT_SERVING_KEY].scores))
- self.assertAllClose(
- expected_probabilities['head1'],
- sess.run(spec.export_outputs['head1'].scores))
- self.assertAllClose(
- expected_probabilities['head2'],
- sess.run(spec.export_outputs['head2'].scores))
-
- def test_predict_two_heads_logits_tensor_multi_dim(self):
- """Tests predict with multi-dimensional logits of shape [2, 2, 5]."""
- head1 = head_lib.regression_head(label_dimension=2, name='head1')
- head2 = head_lib.regression_head(label_dimension=3, name='head2')
- multi_head = multi_head_lib.multi_head([head1, head2])
-
- logits = np.array(
- [[[-1., 1., 2., -2., 2.], [-1., 1., 2., -2., 2.]],
- [[-1.5, 1., -3., 2., -2.], [-1.5, 1., -3., 2., -2.]]],
- dtype=np.float32)
- expected_logits1 = np.array(
- [[[-1., 1.], [-1., 1.]],
- [[-1.5, 1.], [-1.5, 1.]]],
- dtype=np.float32)
- expected_logits2 = np.array(
- [[[2., -2., 2.], [2., -2., 2.]],
- [[-3., 2., -2.], [-3., 2., -2.]]],
- dtype=np.float32)
-
- spec = multi_head.create_estimator_spec(
- features={'x': np.array(((42,),), dtype=np.int32)},
- mode=model_fn.ModeKeys.PREDICT,
- logits=logits)
-
- self.assertItemsEqual(
- (_DEFAULT_SERVING_KEY, 'head1', 'regression/head1', 'predict/head1',
- 'head2', 'regression/head2', 'predict/head2'),
- spec.export_outputs.keys())
-
- # Assert predictions and export_outputs.
- with self.test_session() as sess:
- _initialize_variables(self, spec.scaffold)
- self.assertIsNone(spec.scaffold.summary_op)
- predictions = sess.run(spec.predictions)
- self.assertAllClose(
- expected_logits1,
- predictions[('head1', prediction_keys.PredictionKeys.PREDICTIONS)])
- self.assertAllClose(
- expected_logits2,
- predictions[('head2', prediction_keys.PredictionKeys.PREDICTIONS)])
-
- self.assertAllClose(
- expected_logits1,
- sess.run(spec.export_outputs[_DEFAULT_SERVING_KEY].value))
- self.assertAllClose(
- expected_logits1,
- sess.run(spec.export_outputs['head1'].value))
- self.assertAllClose(
- expected_logits2,
- sess.run(spec.export_outputs['head2'].value))
-
def test_eval_two_heads_with_weights(self):
head1 = head_lib.multi_label_head(n_classes=2, name='head1')
head2 = head_lib.multi_label_head(n_classes=3, name='head2')
@@ -390,84 +284,6 @@ class MultiHeadTest(test.TestCase):
# example_weight_sum = 1 * (1 + 2) + 2 * (2 + 3) = 13
self.assertAllClose(13., example_weight_sum.eval(), rtol=tol, atol=tol)
- def test_train_create_loss_logits_tensor(self):
- """Tests create_loss with logits Tensor."""
- weights1 = np.array([[1.], [2.]], dtype=np.float32)
- weights2 = np.array([[2.], [3.]])
- head1 = head_lib.multi_label_head(n_classes=2, name='head1',
- weight_column='weights1')
- head2 = head_lib.multi_label_head(n_classes=3, name='head2',
- weight_column='weights2')
- multi_head = multi_head_lib.multi_head(
- [head1, head2], head_weights=[1., 2.])
-
- logits = np.array([[-10., 10., 20., -20., 20.],
- [-15., 10., -30., 20., -20.]], dtype=np.float32)
- labels = {
- 'head1': np.array([[1, 0], [1, 1]], dtype=np.int64),
- 'head2': np.array([[0, 1, 0], [1, 1, 0]], dtype=np.int64),
- }
- weighted_sum_loss, example_weight_sum, _ = multi_head.create_loss(
- features={
- 'x': np.array(((42,),), dtype=np.int32),
- 'weights1': weights1,
- 'weights2': weights2
- },
- mode=model_fn.ModeKeys.TRAIN,
- logits=logits,
- labels=labels)
- tol = 1e-3
- with self.test_session():
- # loss of the first head is [[(10 + 10) / 2], [(15 + 0) / 2]]
- # = [10, 7.5]
- # weighted_sum_loss = 1 * 10 + 2 * 7.5 = 25
- # loss of the second head is [[(20 + 20 + 20) / 3], [(30 + 0 + 0) / 3]]
- # = [20, 10]
- # weighted_sum_loss = 2 * 20 + 3 * 10 = 70
- # head-weighted merge = 1 * 25 + 2 * 70 = 165
- self.assertAllClose(165, weighted_sum_loss.eval(), rtol=tol, atol=tol)
- # example_weight_sum = 1 * (1 + 2) + 2 * (2 + 3) = 13
- self.assertAllClose(13., example_weight_sum.eval(), rtol=tol, atol=tol)
-
- def test_train_create_loss_logits_tensor_multi_dim(self):
- """Tests create_loss with multi-dimensional logits of shape [2, 2, 5]."""
- head1 = head_lib.regression_head(label_dimension=2, name='head1')
- head2 = head_lib.regression_head(label_dimension=3, name='head2')
- multi_head = multi_head_lib.multi_head([head1, head2])
-
- logits = np.array(
- [[[-1., 1., 2., -2., 2.], [-1., 1., 2., -2., 2.]],
- [[-1.5, 1.5, -2., 2., -2.], [-1.5, 1.5, -2., 2., -2.]]],
- dtype=np.float32)
- labels = {
- 'head1': np.array([[[1., 0.], [1., 0.]],
- [[1.5, 1.5], [1.5, 1.5]]], dtype=np.float32),
- 'head2': np.array([[[0., 1., 0.], [0., 1., 0.]],
- [[2., 2., 0.], [2., 2., 0.]]], dtype=np.float32),
- }
- # Loss for the first head:
- # loss1 = (1+1)^2 + (0-1)^2 + (1+1)^2 + (0-1)^2 +
- # (1.5+1.5)^2 + (1.5-1.5)^2 + (1.5+1.5)^2 + (1.5-1.5)^2
- # = 28
- # Loss for the second head:
- # loss2 = (0-2)^2 + (1+2)^2 + (0-2)^2 + (0-2)^2 + (1+2)^2 + (0-2)^2 +
- # (2+2)^2 + (2-2)^2 + (0+2)^2 + (2+2)^2 + (2-2)^2 + (0+2)^2
- # = 74
- expected_weighted_sum_loss = 28. + 74.
-
- weighted_sum_loss, example_weight_sum, _ = multi_head.create_loss(
- features={},
- mode=model_fn.ModeKeys.TRAIN,
- logits=logits,
- labels=labels)
- tol = 1e-3
- with self.test_session():
- self.assertAllClose(
- expected_weighted_sum_loss, weighted_sum_loss.eval(),
- rtol=tol, atol=tol)
- self.assertAllClose(
- 2. * 2. * 5., example_weight_sum.eval(), rtol=tol, atol=tol)
-
def test_train_one_head(self):
head1 = head_lib.multi_label_head(n_classes=2, name='head1')
multi_head = multi_head_lib.multi_head([head1])
@@ -511,7 +327,6 @@ class MultiHeadTest(test.TestCase):
six.b('{0:s}{1:.3f}'.format(expected_train_result, expected_loss)),
train_result)
_assert_simple_summaries(self, {
- metric_keys.MetricKeys.LOSS: expected_loss,
metric_keys.MetricKeys.LOSS + '/head1': expected_loss,
# Average loss over examples.
metric_keys.MetricKeys.LOSS_MEAN + '/head1': expected_loss / 2,
@@ -572,7 +387,6 @@ class MultiHeadTest(test.TestCase):
six.b('{0:s}{1:.3f}'.format(expected_train_result, expected_loss)),
train_result)
_assert_simple_summaries(self, {
- metric_keys.MetricKeys.LOSS: expected_loss,
metric_keys.MetricKeys.LOSS + '/head1': expected_loss_head1,
metric_keys.MetricKeys.LOSS + '/head2': expected_loss_head2,
# Average loss over examples.
diff --git a/tensorflow/contrib/estimator/python/estimator/replicate_model_fn.py b/tensorflow/contrib/estimator/python/estimator/replicate_model_fn.py
index 0848c5f62f..7005a647db 100644
--- a/tensorflow/contrib/estimator/python/estimator/replicate_model_fn.py
+++ b/tensorflow/contrib/estimator/python/estimator/replicate_model_fn.py
@@ -34,12 +34,10 @@ from tensorflow.python.estimator import util
from tensorflow.python.estimator.export import export_output as export_output_lib
from tensorflow.python.framework import device as framework_device
from tensorflow.python.framework import ops as ops_lib
-from tensorflow.python.framework import sparse_tensor
from tensorflow.python.ops import array_ops
from tensorflow.python.ops import control_flow_ops
from tensorflow.python.ops import gradients as gradients_lib
from tensorflow.python.ops import math_ops
-from tensorflow.python.ops import sparse_ops
from tensorflow.python.ops import state_ops
from tensorflow.python.ops import variable_scope
from tensorflow.python.ops import variables as variables_lib
@@ -185,17 +183,10 @@ def _split_batch(features, labels, number_of_shards, device):
"""Split input features and labes into batches."""
def split_dictionary(dictionary):
- """Split a dictionary into shards."""
shards = [{} for _ in range(number_of_shards)]
for name, tensor in six.iteritems(dictionary):
- if isinstance(tensor, sparse_tensor.SparseTensor):
- for i, shard in enumerate(
- sparse_ops.sparse_split(
- sp_input=tensor, num_split=number_of_shards, axis=0)):
- shards[i][name] = shard
- else:
- for i, shard in enumerate(array_ops.split(tensor, number_of_shards)):
- shards[i][name] = shard
+ for i, shard in enumerate(array_ops.split(tensor, number_of_shards)):
+ shards[i][name] = shard
return shards
with ops_lib.name_scope('split_inputs'):
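
A standalone sketch of the dense-only sharding `split_dictionary` performs after this revert (`SparseTensor` splitting was removed); it assumes the batch divides evenly across shards:

```python
import tensorflow as tf

def split_dictionary(dictionary, number_of_shards):
  # Split every dense feature Tensor along axis 0 into equal shards.
  shards = [{} for _ in range(number_of_shards)]
  for name, tensor in dictionary.items():
    for i, shard in enumerate(tf.split(tensor, number_of_shards)):
      shards[i][name] = shard
  return shards

features = {'x': tf.reshape(tf.range(8.), [4, 2])}
shards = split_dictionary(features, number_of_shards=2)
# shards[0]['x'] and shards[1]['x'] each have shape [2, 2].
```
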
@@ -322,17 +313,7 @@ def _call_optimizer_fn(optimizer_fn, params):
def _compute_sum_on_device(values, device, name=None):
with ops_lib.device(device):
- if isinstance(values[0], ops_lib.IndexedSlices):
- if name:
- raise ValueError('The name {} is not expected to be given to '
- 'IndexedSlices {}'.format(name, values))
-
- values_concat = array_ops.concat([v.values for v in values], axis=0)
- indices_concat = array_ops.concat([v.indices for v in values], axis=0)
- return ops_lib.IndexedSlices(values_concat, indices_concat,
- values[0].dense_shape)
- else:
- return math_ops.add_n(values, name=name)
+ return math_ops.add_n(values, name=name)
def _train_spec(tower_specs,
@@ -357,17 +338,25 @@ def _eval_spec(tower_specs, aggregation_device, aggregated_loss_name='loss'):
[spec.loss for spec in tower_specs], aggregation_device,
aggregated_loss_name)
- update_ops = []
+ eval_metric_ops_lists = {}
for tower_spec in tower_specs:
- for name, (_, update_op) in six.iteritems(tower_spec.eval_metric_ops):
+ metrics = tower_spec.eval_metric_ops or {}
+ for name, (_, update_op) in six.iteritems(metrics):
+      update_ops = eval_metric_ops_lists.setdefault(name, [])
update_ops.append(update_op)
- with ops_lib.control_dependencies(update_ops):
- reduced_update_op = _reduce_metric_variables(len(tower_specs))
-
eval_metric_ops = {}
for name, (metric_tensor, _) in six.iteritems(tower_specs[0].eval_metric_ops):
+ with ops_lib.control_dependencies(eval_metric_ops_lists[name]):
+      # This operation reduces the local variables across all metrics, yet it
+      # is called once per metric. That is redundant, but it is done because
+      # it is hard to know which local variables correspond to which metric.
+      # Estimator executes all `reduced_update_op`s as part of a group inside
+      # a single `Session.run()` call, which avoids the duplicate computation.
+ reduced_update_op = _reduce_metric_variables(len(tower_specs))
eval_metric_ops[name] = (metric_tensor, reduced_update_op)
+
estimator_spec['eval_metric_ops'] = eval_metric_ops
return model_fn_lib.EstimatorSpec(**estimator_spec)
diff --git a/tensorflow/contrib/estimator/python/estimator/replicate_model_fn_test.py b/tensorflow/contrib/estimator/python/estimator/replicate_model_fn_test.py
index 21d5a9c327..10b47fba5a 100644
--- a/tensorflow/contrib/estimator/python/estimator/replicate_model_fn_test.py
+++ b/tensorflow/contrib/estimator/python/estimator/replicate_model_fn_test.py
@@ -65,35 +65,20 @@ class DNNClassifierIntegrationTest(test_util.TensorFlowTestCase):
data = np.linspace(
0., n_classes - 1., batch_size * input_dimension, dtype=np.float32)
x_data = data.reshape(batch_size, input_dimension)
- categorical_data = np.random.random_integers(
- 0, len(x_data), size=len(x_data))
y_data = np.reshape(self._as_label(data[:batch_size]), (batch_size, 1))
train_input_fn = numpy_io.numpy_input_fn(
- x={'x': x_data,
- 'categories': categorical_data},
+ x={'x': x_data},
y=y_data,
batch_size=batch_size,
num_epochs=None,
shuffle=True)
eval_input_fn = numpy_io.numpy_input_fn(
- x={'x': x_data,
- 'categories': categorical_data},
- y=y_data,
- batch_size=batch_size,
- shuffle=False)
+ x={'x': x_data}, y=y_data, batch_size=batch_size, shuffle=False)
predict_input_fn = numpy_io.numpy_input_fn(
- x={'x': x_data,
- 'categories': categorical_data},
- batch_size=batch_size,
- shuffle=False)
+ x={'x': x_data}, batch_size=batch_size, shuffle=False)
feature_columns = [
- feature_column.numeric_column('x', shape=(input_dimension,)),
- feature_column.indicator_column(
- feature_column.categorical_column_with_vocabulary_list(
- 'categories',
- vocabulary_list=np.linspace(
- 0., len(x_data), len(x_data), dtype=np.int64)))
+ feature_column.numeric_column('x', shape=(input_dimension,))
]
estimator = dnn.DNNClassifier(
@@ -105,11 +90,14 @@ class DNNClassifierIntegrationTest(test_util.TensorFlowTestCase):
def optimizer_fn():
return optimizers.get_optimizer_instance('Adagrad', learning_rate=0.05)
+ # TODO(isaprykin): Switch Estimator to use allow_soft_placement=True
+ # during export_savedmodel and then switch this test to replicate over
+ # GPUs instead of CPUs.
estimator = estimator_lib.Estimator(
model_fn=replicate_model_fn.replicate_model_fn(
estimator.model_fn,
optimizer_fn,
- devices=['/gpu:0', '/gpu:1', '/gpu:2']),
+ devices=['/cpu:0', '/cpu:0', '/cpu:0']),
model_dir=estimator.model_dir,
config=estimator.config,
params=estimator.params)
@@ -242,7 +230,6 @@ class ReplicateModelTest(test_util.TensorFlowTestCase):
accuracy = session.run(accuracy)
auc = session.run(auc)
- # loss[i] = features[i] * 10 - labels[i].
# Accuracy is 0.0 (no match) in the first tower.
# Accuracy is 1.0 (match) in the second tower, since the feature
# times weight "c" happened to be equal to the label.
@@ -544,7 +531,8 @@ class EvalSpecTest(test_util.TensorFlowTestCase):
self.assertEqual('/device:CPU:0', auc.device)
session.run([a, b])
- accuracy, auc = session.run([accuracy, auc])
+ accuracy = session.run(accuracy)
+ auc = session.run(auc)
self.assertNear((12 - 2) / 12, accuracy, 0.01)
self.assertEqual(0, auc)
@@ -873,7 +861,7 @@ class LocalDeviceSetterTest(test_util.TensorFlowTestCase):
class ComputeSumWithDevicePlacementTest(test_util.TensorFlowTestCase):
- def test_vectors(self):
+ def test_example(self):
with self.test_session() as session:
total = replicate_model_fn._compute_sum_on_device(
[1.0, 2.0, 3.0, 4.0], device='/device:GPU:0', name='test_sum')
@@ -882,68 +870,6 @@ class ComputeSumWithDevicePlacementTest(test_util.TensorFlowTestCase):
self.assertEqual('test_sum', total.op.name)
self.assertEqual(10.0, session.run(total))
- def test_tensors(self):
- with self.test_session() as session:
- total = replicate_model_fn._compute_sum_on_device(
- [[1.0, 2.0], [3.0, 4.0]], device='/device:GPU:0', name='test_sum')
-
- self.assertEqual('/device:GPU:0', total.device)
- self.assertEqual('test_sum', total.op.name)
- self.assertAllEqual([4.0, 6.0], session.run(total))
-
- def test_indexedslices(self):
- with self.test_session() as session:
- a = ops_lib.IndexedSlices(
- constant_op.constant([1.0, 2.0]), [0, 1],
- dense_shape=constant_op.constant([2]))
- b = ops_lib.IndexedSlices(constant_op.constant([3.0, 4.0]), [0, 1])
-
- total = replicate_model_fn._compute_sum_on_device(
- [a, b], device='/device:GPU:0')
-
- self.assertEqual('/device:GPU:0', total.device)
- self.assertAllEqual([4.0, 6.0],
- session.run(ops_lib.convert_to_tensor(total)))
-
- def test_indexedslices_higher_dimensions(self):
- with self.test_session() as session:
- a = ops_lib.IndexedSlices(
- constant_op.constant([[1.0, 5.0], [2.0, 6.0]]), [0, 1],
- dense_shape=constant_op.constant([2, 4]))
- b = ops_lib.IndexedSlices(
- constant_op.constant([[3.0, 7.0], [4.0, 8.0]]), [0, 1])
-
- total = replicate_model_fn._compute_sum_on_device(
- [a, b], device='/device:GPU:0')
-
- self.assertEqual('/device:GPU:0', total.device)
- self.assertAllEqual([[4.0, 12.0], [6.0, 14.0]],
- session.run(ops_lib.convert_to_tensor(total)))
-
- def test_indexedslices_some_dont_overlap(self):
- with self.test_session() as session:
- a = ops_lib.IndexedSlices(
- constant_op.constant([1.0, 2.0]), [0, 3],
- dense_shape=constant_op.constant([4]))
- b = ops_lib.IndexedSlices(constant_op.constant([3.0, 4.0]), [0, 1])
-
- total = replicate_model_fn._compute_sum_on_device(
- [a, b], device='/device:GPU:0')
-
- self.assertEqual('/device:GPU:0', total.device)
- self.assertAllEqual([4.0, 4.0, 0.0, 2.0],
- session.run(ops_lib.convert_to_tensor(total)))
-
- def test_no_name_for_indexslices(self):
- a = ops_lib.IndexedSlices(
- constant_op.constant([1.0, 2.0]), [0, 1],
- dense_shape=constant_op.constant([2]))
- b = ops_lib.IndexedSlices(constant_op.constant([3.0, 4.0]), [0, 1])
-
- with self.assertRaisesRegexp(ValueError, ''):
- _ = replicate_model_fn._compute_sum_on_device(
- [a, b], device='/device:GPU:0', name='cant_name_indexslices')
-
class ConcatTensorDictsTest(test_util.TensorFlowTestCase):
diff --git a/tensorflow/contrib/gan/python/estimator/python/gan_estimator_impl.py b/tensorflow/contrib/gan/python/estimator/python/gan_estimator_impl.py
index 0824ecf616..e89993991a 100644
--- a/tensorflow/contrib/gan/python/estimator/python/gan_estimator_impl.py
+++ b/tensorflow/contrib/gan/python/estimator/python/gan_estimator_impl.py
@@ -76,7 +76,7 @@ class GANEstimator(estimator.Estimator):
return logits
# Create GAN estimator.
- gan_estimator = tfgan.estimator.GANEstimator(
+ gan_estimator = estimator.GANEstimator(
model_dir,
generator_fn=generator_fn,
discriminator_fn=discriminator_fn,
diff --git a/tensorflow/contrib/kfac/python/ops/layer_collection.py b/tensorflow/contrib/kfac/python/ops/layer_collection.py
index 7300a7998c..4eabb59b3e 100644
--- a/tensorflow/contrib/kfac/python/ops/layer_collection.py
+++ b/tensorflow/contrib/kfac/python/ops/layer_collection.py
@@ -448,10 +448,10 @@ class LayerCollection(object):
tf.get_variable_scope().reuse.
Raises:
- ValueError: If reuse == True and name == None.
- ValueError: If reuse == True and seed != None.
- KeyError: If reuse == True and no existing LossFunction with 'name' found.
- KeyError: If reuse == False and existing LossFunction with 'name' found.
+ ValueError: If reuse=True and name != None.
+ ValueError: If reuse=True and seed != None.
+ KeyError: If reuse=True and no existing LossFunction with 'name' found.
+ KeyError: If reuse=False and existing LossFunction with 'name' found.
"""
name = name or self._graph.unique_name(
"register_categorical_predictive_distribution")
diff --git a/tensorflow/contrib/layers/python/layers/layers.py b/tensorflow/contrib/layers/python/layers/layers.py
index ad4a0b302f..dab5a5297c 100644
--- a/tensorflow/contrib/layers/python/layers/layers.py
+++ b/tensorflow/contrib/layers/python/layers/layers.py
@@ -286,7 +286,6 @@ def _fused_batch_norm(inputs,
ValueError: If the rank of `inputs` is neither 2 or 4.
ValueError: If rank or `C` dimension of `inputs` is undefined.
"""
- # TODO(reedwm): Add support for fp16 inputs.
if data_format not in (DATA_FORMAT_NCHW, DATA_FORMAT_NHWC):
raise ValueError('data_format has to be either NCHW or NHWC.')
with variable_scope.variable_scope(
@@ -320,9 +319,10 @@ def _fused_batch_norm(inputs,
(inputs.name, params_shape))
# Allocate parameters for the beta and gamma of the normalization.
- trainable_beta = trainable and center
beta_collections = utils.get_variable_collections(variables_collections,
'beta')
+    # Float32 is required to avoid precision loss with fp16 inputs/outputs.
+ variable_dtype = dtypes.float32
if not param_initializers:
param_initializers = {}
if not param_regularizers:
@@ -336,13 +336,13 @@ def _fused_batch_norm(inputs,
beta = variables.model_variable(
'beta',
shape=params_shape,
- dtype=dtype,
+ dtype=variable_dtype,
initializer=beta_initializer,
regularizer=beta_regularizer,
collections=beta_collections,
- trainable=trainable_beta)
+ trainable=trainable)
else:
- beta = array_ops.constant(0.0, shape=params_shape)
+ beta = array_ops.constant(0.0, dtype=variable_dtype, shape=params_shape)
if scale:
gamma_collections = utils.get_variable_collections(
@@ -352,13 +352,13 @@ def _fused_batch_norm(inputs,
gamma = variables.model_variable(
'gamma',
shape=params_shape,
- dtype=dtype,
+ dtype=variable_dtype,
initializer=gamma_initializer,
regularizer=gamma_regularizer,
collections=gamma_collections,
trainable=trainable)
else:
- gamma = array_ops.constant(1.0, shape=params_shape)
+ gamma = array_ops.constant(1.0, dtype=variable_dtype, shape=params_shape)
# Create moving_mean and moving_variance variables and add them to the
# appropriate collections. We disable variable partitioning while creating
@@ -375,7 +375,7 @@ def _fused_batch_norm(inputs,
moving_mean = variables.model_variable(
'moving_mean',
shape=params_shape,
- dtype=dtype,
+ dtype=variable_dtype,
initializer=moving_mean_initializer,
trainable=False,
collections=moving_mean_collections)
@@ -386,7 +386,7 @@ def _fused_batch_norm(inputs,
moving_variance = variables.model_variable(
'moving_variance',
shape=params_shape,
- dtype=dtype,
+ dtype=variable_dtype,
initializer=moving_variance_initializer,
trainable=False,
collections=moving_variance_collections)
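
The net effect of these hunks is that `_fused_batch_norm` keeps beta/gamma and the moving statistics in float32 even when the input is float16, as the tests below verify. A usage sketch:

```python
import tensorflow as tf

images = tf.random_uniform((5, 3, 3, 3), dtype=tf.float16)
out = tf.contrib.layers.batch_norm(images, fused=True, scale=True)
print(out.dtype)  # float16: the output keeps the input dtype.
for v in tf.global_variables():
  print(v.name, v.dtype.base_dtype)  # beta/gamma/moving_* are float32.
```
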
diff --git a/tensorflow/contrib/layers/python/layers/layers_test.py b/tensorflow/contrib/layers/python/layers/layers_test.py
index 2837a3172d..7ccd9d8868 100644
--- a/tensorflow/contrib/layers/python/layers/layers_test.py
+++ b/tensorflow/contrib/layers/python/layers/layers_test.py
@@ -1766,10 +1766,12 @@ class BatchNormTest(test.TestCase):
with self.assertRaisesRegexp(ValueError, 'undefined'):
_layers.batch_norm(inputs, data_format='NCHW')
- def _testCreateOp(self, fused):
+ def _testCreateOp(self, fused, dtype=None):
+ if dtype is None:
+ dtype = dtypes.float32
height, width = 3, 3
with self.test_session():
- images = np.random.uniform(size=(5, height, width, 3)).astype('f')
+ images = np.random.uniform(size=(5, height, width, 3)).astype(dtype.as_numpy_dtype)
output = _layers.batch_norm(images, fused=fused)
expected_name = ('BatchNorm/FusedBatchNorm' if fused else
'BatchNorm/batchnorm')
@@ -1784,6 +1786,9 @@ class BatchNormTest(test.TestCase):
def testCreateOpFused(self):
self._testCreateOp(True)
+ def testCreateOpFusedFloat16(self):
+ self._testCreateOp(True, dtypes.float16)
+
def _testCreateOpBetaRegularizer(self, fused=True):
height, width = 3, 3
with self.test_session():
@@ -2651,10 +2656,68 @@ class BatchNormTest(test.TestCase):
def testBatchNormBeta(self):
# Test case for 11673
with self.test_session() as sess:
- a = array_ops.placeholder(dtypes.float32, shape=(10, 10, 10, 10))
- b = _layers.batch_norm(a, center=False, data_format='NCHW',
- zero_debias_moving_mean=True)
+ a_32 = array_ops.placeholder(dtypes.float32, shape=(10, 10, 10, 10))
+ b_32 = _layers.batch_norm(a_32, center=False, data_format='NCHW',
+ zero_debias_moving_mean=True)
+ a_16 = array_ops.placeholder(dtypes.float16, shape=(10, 10, 10, 10))
+ b_16 = _layers.batch_norm(a_16, center=False, data_format='NCHW',
+ zero_debias_moving_mean=True)
+ sess.run(variables_lib.global_variables_initializer())
+
+ def testVariablesAreFloat32(self):
+ height, width = 3, 3
+ with self.test_session():
+ images = random_ops.random_uniform((5, height, width, 3),
+ seed=1, dtype=dtypes.float16)
+ _layers.batch_norm(images, scale=True)
+ beta = variables.get_variables_by_name('beta')[0]
+ gamma = variables.get_variables_by_name('gamma')[0]
+ self.assertEqual(beta.dtype, dtypes.float32_ref)
+ self.assertEqual(gamma.dtype, dtypes.float32_ref)
+ moving_mean = variables.get_variables_by_name('moving_mean')[0]
+ moving_variance = variables.get_variables_by_name('moving_variance')[0]
+ self.assertEqual(moving_mean.dtype, dtypes.float32_ref)
+ self.assertEqual(moving_variance.dtype, dtypes.float32_ref)
+
+ def _runFusedBatchNorm(self, shape, dtype):
+ channels = shape[1]
+ images = np.arange(np.product(shape), dtype=dtype).reshape(shape)
+ beta = init_ops.constant_initializer(
+ np.arange(
+ 2, channels + 2, dtype=np.float32))
+ gamma = init_ops.constant_initializer(
+ np.arange(
+ 10, channels + 10, dtype=np.float32) * 2.0)
+ mean = init_ops.constant_initializer(
+ np.arange(
+ 3, channels + 3, dtype=np.float32) * 5.0)
+ variance = init_ops.constant_initializer(
+ np.arange(
+ 1, channels + 1, dtype=np.float32) * 4.0)
+ output = _layers.batch_norm(
+ images,
+ fused=True,
+ is_training=True,
+ scale=True,
+ epsilon=0.5,
+ param_initializers={
+ 'beta': beta,
+ 'gamma': gamma,
+ 'moving_mean': mean,
+ 'moving_variance': variance,
+ },
+ data_format='NCHW')
+ with self.test_session(use_gpu=True) as sess:
sess.run(variables_lib.global_variables_initializer())
+ return sess.run(output)
+
+ def testFusedBatchNormFloat16MatchesFloat32(self):
+ if test.is_gpu_available(cuda_only=True):
+ shape = [5, 4, 2, 3]
+ res_32 = self._runFusedBatchNorm(shape, np.float32)
+ res_16 = self._runFusedBatchNorm(shape, np.float16)
+ self.assertAllClose(res_32, res_16, rtol=1e-3)
+
def testAdjustmentCreated(self):
# Tests that the adjustment is appropriately passed to and used by the core
diff --git a/tensorflow/contrib/learn/python/learn/estimators/head.py b/tensorflow/contrib/learn/python/learn/estimators/head.py
index 468d792a0d..bc0e6fc009 100644
--- a/tensorflow/contrib/learn/python/learn/estimators/head.py
+++ b/tensorflow/contrib/learn/python/learn/estimators/head.py
@@ -119,7 +119,7 @@ class Head(object):
update_op = tf.contrib.layers.optimize_loss(optimizer=sync,
loss=model_fn_ops.loss, ...)
hooks = [sync.make_session_run_hook(is_chief)]
- ... upate train_op and hooks in ModelFnOps and return
+ ... update train_op and hooks in ModelFnOps and return
```
"""
__metaclass__ = abc.ABCMeta
diff --git a/tensorflow/contrib/learn/python/learn/estimators/model_fn.py b/tensorflow/contrib/learn/python/learn/estimators/model_fn.py
index 8be9c72adf..44e6c7c52d 100644
--- a/tensorflow/contrib/learn/python/learn/estimators/model_fn.py
+++ b/tensorflow/contrib/learn/python/learn/estimators/model_fn.py
@@ -23,7 +23,6 @@ import collections
import six
-from tensorflow.contrib import framework as contrib_framework
from tensorflow.contrib.framework import get_graph_from_inputs
from tensorflow.contrib.learn.python.learn.estimators import constants
from tensorflow.contrib.learn.python.learn.estimators import metric_key
@@ -32,6 +31,7 @@ from tensorflow.python.estimator import model_fn as core_model_fn_lib
from tensorflow.python.estimator.export import export_output as core_export_lib
from tensorflow.python.framework import dtypes
from tensorflow.python.framework import ops
+from tensorflow.python.framework import sparse_tensor
from tensorflow.python.framework import tensor_shape
from tensorflow.python.ops import array_ops
from tensorflow.python.platform import tf_logging as logging
@@ -156,11 +156,11 @@ class ModelFnOps(
else:
if isinstance(predictions, dict):
predictions = {
- k: contrib_framework.convert_to_tensor_or_sparse_tensor(v)
+ k: sparse_tensor.convert_to_tensor_or_sparse_tensor(v)
for k, v in six.iteritems(predictions)
}
else:
- predictions = contrib_framework.convert_to_tensor_or_sparse_tensor(
+ predictions = sparse_tensor.convert_to_tensor_or_sparse_tensor(
predictions)
# Validate eval_metric_ops
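The substitution above only moves the helper from `contrib.framework` to its core home in `sparse_tensor`; behaviour is unchanged. A small sketch of what the helper does (assuming the TF 1.x module layout used in this file):

```python
from tensorflow.python.framework import sparse_tensor

# Dense inputs are converted to an ordinary Tensor...
dense = sparse_tensor.convert_to_tensor_or_sparse_tensor([[1.0, 2.0]])

# ...while SparseTensor inputs are passed through unchanged.
sp = sparse_tensor.SparseTensor(
    indices=[[0, 1]], values=[2.0], dense_shape=[1, 2])
same = sparse_tensor.convert_to_tensor_or_sparse_tensor(sp)
assert same is sp
```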
diff --git a/tensorflow/contrib/learn/python/learn/learn_io/data_feeder.py b/tensorflow/contrib/learn/python/learn/learn_io/data_feeder.py
index 4c50d40aaa..db18ebf05d 100644
--- a/tensorflow/contrib/learn/python/learn/learn_io/data_feeder.py
+++ b/tensorflow/contrib/learn/python/learn/learn_io/data_feeder.py
@@ -28,13 +28,14 @@ import six
from six.moves import xrange # pylint: disable=redefined-builtin
from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import ops
+from tensorflow.python.framework import tensor_util
from tensorflow.python.ops import array_ops
from tensorflow.python.platform import tf_logging as logging
# pylint: disable=g-multiple-import,g-bad-import-order
from .pandas_io import HAS_PANDAS, extract_pandas_data, extract_pandas_matrix, extract_pandas_labels
from .dask_io import HAS_DASK, extract_dask_data, extract_dask_labels
-
# pylint: enable=g-multiple-import,g-bad-import-order
@@ -365,8 +366,13 @@ class DataFeeder(object):
self.random_state = np.random.RandomState(
42) if random_state is None else random_state
- num_samples = list(self._x.values())[0].shape[
- 0] if x_is_dict else self._x.shape[0]
+ if x_is_dict:
+ num_samples = list(self._x.values())[0].shape[0]
+ elif tensor_util.is_tensor(self._x):
+ num_samples = self._x.shape[0].value # shape will be a Dimension, extract an int
+ else:
+ num_samples = self._x.shape[0]
+
if self._shuffle:
self.indices = self.random_state.permutation(num_samples)
else:
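Context for the new `tensor_util.is_tensor` branch: in TF 1.x, indexing a `Tensor`'s static shape yields a `Dimension` object rather than a plain int, which is why the code calls `.value`. A brief illustration:

```python
import tensorflow as tf

x = tf.placeholder(tf.float32, shape=(128, 10))
dim = x.shape[0]         # tf.Dimension(128), not a plain int
num_samples = dim.value  # 128 as a Python int (None if the size is unknown)
```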
diff --git a/tensorflow/contrib/linear_optimizer/python/ops/sdca_ops.py b/tensorflow/contrib/linear_optimizer/python/ops/sdca_ops.py
index 13f2f0f502..86d8484391 100644
--- a/tensorflow/contrib/linear_optimizer/python/ops/sdca_ops.py
+++ b/tensorflow/contrib/linear_optimizer/python/ops/sdca_ops.py
@@ -238,10 +238,10 @@ class SdcaModel(object):
with name_scope('sdca/prediction'):
sparse_variables = self._convert_n_to_tensor(self._variables[
'sparse_features_weights'])
- result = 0.0
+ result_sparse = 0.0
for sfc, sv in zip(examples['sparse_features'], sparse_variables):
# TODO(sibyl-Aix6ihai): following does not take care of missing features.
- result += math_ops.segment_sum(
+ result_sparse += math_ops.segment_sum(
math_ops.multiply(
array_ops.gather(sv, sfc.feature_indices), sfc.feature_values),
sfc.example_indices)
@@ -249,12 +249,13 @@ class SdcaModel(object):
dense_variables = self._convert_n_to_tensor(self._variables[
'dense_features_weights'])
+ result_dense = 0.0
for i in range(len(dense_variables)):
- result += math_ops.matmul(dense_features[i],
- array_ops.expand_dims(dense_variables[i], -1))
+ result_dense += math_ops.matmul(
+ dense_features[i], array_ops.expand_dims(dense_variables[i], -1))
# Reshaping to allow shape inference at graph construction time.
- return array_ops.reshape(result, [-1])
+ return array_ops.reshape(result_dense, [-1]) + result_sparse
def predictions(self, examples):
"""Add operations to compute predictions by the model.
diff --git a/tensorflow/contrib/makefile/Makefile b/tensorflow/contrib/makefile/Makefile
index dba1464653..e2e6c05591 100644
--- a/tensorflow/contrib/makefile/Makefile
+++ b/tensorflow/contrib/makefile/Makefile
@@ -314,7 +314,8 @@ ifeq ($(TARGET),ANDROID)
-Wno-narrowing \
-fomit-frame-pointer \
$(MARCH_OPTION) \
--fPIE
+-fPIE \
+-fPIC
INCLUDES = \
-I$(NDK_ROOT)/sources/android/support/include \
-I$(NDK_ROOT)/sources/cxx-stl/gnu-libstdc++/4.9/include \
diff --git a/tensorflow/contrib/makefile/README.md b/tensorflow/contrib/makefile/README.md
index 715eb51577..65bd60c12a 100644
--- a/tensorflow/contrib/makefile/README.md
+++ b/tensorflow/contrib/makefile/README.md
@@ -174,10 +174,26 @@ tensorflow/contrib/makefile/build_all_ios.sh
This process will take around twenty minutes on a modern MacBook Pro.
-When it completes, you will have a library for a single architecture and the
-benchmark program. Although successfully compiling the benchmark program is a
+When it completes, you will have a unified library for all architectures
+(i386sim, x86_64sim, armv7, armv7s and arm64) and the benchmark program.
+Although successfully compiling the benchmark program is a
sign of success, the program is not a complete iOS app.
+If you would like to build only one architecture to save time
+(iOS 11+ supports only 64-bit, so arm64 alone is enough):
+
+```bash
+tensorflow/contrib/makefile/build_all_ios.sh -a arm64
+```
+
+After the first build, if you would like to rebuild just the TensorFlow
+library, you can pass the -T flag to skip the clean and dependency steps.
+If you have modified only one file, regenerating the library should take
+just a few seconds.
+
+```bash
+tensorflow/contrib/makefile/build_all_ios.sh -a arm64 -T
+```
+
To see TensorFlow running on iOS, the example Xcode project in
[tensorflow/examples/ios](../../examples/ios/) shows how to use the static
library in a simple app.
@@ -193,19 +209,18 @@ If you have not already, you will need to download dependencies:
tensorflow/contrib/makefile/download_dependencies.sh
```
-Next, you will need to compile protobufs for iOS:
+Next, you will need to compile protobufs for iOS (optionally takes the -a $ARCH flag):
```bash
-tensorflow/contrib/makefile/compile_ios_protobuf.sh
+tensorflow/contrib/makefile/compile_ios_protobuf.sh
```
-Then, you will need to compile the nsync library for iOS:
+Then, you will need to compile the nsync library for iOS (optionally takes -a $ARCH flag):
```bash
export HOST_NSYNC_LIB=`tensorflow/contrib/makefile/compile_nsync.sh`
export TARGET_NSYNC_LIB=`tensorflow/contrib/makefile/compile_nsync.sh -t ios`
```
-
Then, you can run the makefile specifying iOS as the target, along with the
architecture you want to build for:
@@ -219,10 +234,6 @@ This creates a library in
`tensorflow/contrib/makefile/gen/lib/libtensorflow-core.a` that you can link any
xcode project against.
-At this point, you will have a library for a single architecture and the
-benchmark program. Although successfully compiling the benchmark program is a
-sign of success, the program is not a complete iOS app.
-
To see TensorFlow running on iOS, the example Xcode project in
[tensorflow/examples/ios](../../examples/ios/) shows how to use the static
library in a simple app.
@@ -237,6 +248,14 @@ time follow it with:
compile_ios_tensorflow.sh
```
+`compile_ios_tensorflow.sh` takes the -a flag to build for a single architecture.
+If you run into unresolved-symbol issues with nsync, you can also pass
+-h ${HOST_NSYNC_LIB} and -n ${TARGET_NSYNC_LIB}, so the invocation looks like:
+
+```bash
+tensorflow/contrib/makefile/compile_ios_tensorflow.sh -f "-O3" -h tensorflow/contrib/makefile/downloads/nsync/builds/default.macos.c++11/nsync.a -n tensorflow/contrib/makefile/downloads/nsync/builds/lipo.ios.c++11/nsync.a -a arm64
+```
+
In XCode, you will need to use -force_load in the linker flags
section of the build settings to pull in the global constructors that are used
to register ops and kernels.
@@ -249,7 +268,7 @@ debug mode. If you are concerned about performance or are working on a release
build, you would likely want a higher optimization setting, like so:
```bash
-compile_ios_tensorflow.sh "-Os"
+compile_ios_tensorflow.sh -f "-Os"
```
For other variations of valid optimization flags, see [clang optimization levels](http://stackoverflow.com/questions/15548023/clang-optimization-levels).
diff --git a/tensorflow/contrib/makefile/build_all_ios.sh b/tensorflow/contrib/makefile/build_all_ios.sh
index a49bbe4565..988e12b482 100755
--- a/tensorflow/contrib/makefile/build_all_ios.sh
+++ b/tensorflow/contrib/makefile/build_all_ios.sh
@@ -23,14 +23,29 @@ if [[ $(uname) != "Darwin" ]]; then
exit 1
fi
+usage() {
+ echo "Usage: $(basename "$0") [-a:T]"
+ echo "-a [build_arch] build only for specified arch x86_64 [default=all]"
+ echo "-T only build tensorflow (dont download other deps etc)"
+ exit 1
+}
+
+while getopts "a:T" opt_name; do
+ case "$opt_name" in
+ a) BUILD_ARCH="${OPTARG}";;
+ T) ONLY_MAKE_TENSORFLOW="true";;
+ *) usage;;
+ esac
+done
+shift $((OPTIND - 1))
+
+
# Make sure we're in the correct directory, at the root of the source tree.
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
cd ${SCRIPT_DIR}/../../../
-
-# Remove any old files first.
-make -f tensorflow/contrib/makefile/Makefile clean
-rm -rf tensorflow/contrib/makefile/downloads
+source "${SCRIPT_DIR}/build_helper.subr"
+JOB_COUNT="${JOB_COUNT:-$(get_job_count)}"
# Setting a deployment target is required for building with bitcode,
# otherwise linking will fail with:
@@ -41,20 +56,37 @@ if [[ -n MACOSX_DEPLOYMENT_TARGET ]]; then
export MACOSX_DEPLOYMENT_TARGET=$(sw_vers -productVersion)
fi
-# Pull down the required versions of the frameworks we need.
-tensorflow/contrib/makefile/download_dependencies.sh
+if [[ "${ONLY_MAKE_TENSORFLOW}" != "true" ]]; then
+ # Remove any old files first.
+ make -f tensorflow/contrib/makefile/Makefile clean
+ rm -rf tensorflow/contrib/makefile/downloads
-# Compile protobuf for the target iOS device architectures.
-tensorflow/contrib/makefile/compile_ios_protobuf.sh
+ # Pull down the required versions of the frameworks we need.
+ tensorflow/contrib/makefile/download_dependencies.sh
+
+ # Compile protobuf for the target iOS device architectures.
+ tensorflow/contrib/makefile/compile_ios_protobuf.sh
+fi
# Compile nsync for the target iOS device architectures.
# Don't use export var=`something` syntax; it swallows the exit status.
HOST_NSYNC_LIB=`tensorflow/contrib/makefile/compile_nsync.sh`
-TARGET_NSYNC_LIB=`tensorflow/contrib/makefile/compile_nsync.sh -t ios`
+if [[ -z "${BUILD_ARCH}" ]]; then
+ # No arch specified so build all architectures
+ TARGET_NSYNC_LIB=`tensorflow/contrib/makefile/compile_nsync.sh -t ios`
+else
+ # arch specified so build just that
+ TARGET_NSYNC_LIB=`tensorflow/contrib/makefile/compile_nsync.sh -t ios -a ${BUILD_ARCH}`
+fi
export HOST_NSYNC_LIB TARGET_NSYNC_LIB
-# Build the iOS TensorFlow libraries.
-tensorflow/contrib/makefile/compile_ios_tensorflow.sh "-O3"
+if [[ -z "${BUILD_ARCH}" ]]; then
+ # build the ios tensorflow libraries.
+ tensorflow/contrib/makefile/compile_ios_tensorflow.sh -f "-O3" -h $HOST_NSYNC_LIB -n $TARGET_NSYNC_LIB
+else
+ # arch specified so build just that
+ tensorflow/contrib/makefile/compile_ios_tensorflow.sh -f "-O3" -a "${BUILD_ARCH}" -h $HOST_NSYNC_LIB -n $TARGET_NSYNC_LIB
+fi
# Creates a static universal library in
# tensorflow/contrib/makefile/gen/lib/libtensorflow-core.a
diff --git a/tensorflow/contrib/makefile/compile_ios_protobuf.sh b/tensorflow/contrib/makefile/compile_ios_protobuf.sh
index 4056db18a7..43e5809dd2 100755
--- a/tensorflow/contrib/makefile/compile_ios_protobuf.sh
+++ b/tensorflow/contrib/makefile/compile_ios_protobuf.sh
@@ -21,10 +21,28 @@ if [[ -n MACOSX_DEPLOYMENT_TARGET ]]; then
export MACOSX_DEPLOYMENT_TARGET=$(sw_vers -productVersion)
fi
-SCRIPT_DIR=$(dirname $0)
+usage() {
+ echo "Usage: $(basename "$0") [-a]"
+ echo "-a [build_arch] build for specified arch comma separate for multiple archs (eg: x86_64,arm64)"
+ echo "default arch i386, x86_64, armv7, armv7s, arm64"
+ exit 1
+}
+
+BUILD_TARGET="i386 x86_64 armv7 armv7s arm64"
+while getopts "a:" opt_name; do
+ case "$opt_name" in
+ a) BUILD_TARGET="${OPTARG}";;
+ *) usage;;
+ esac
+done
+shift $((OPTIND - 1))
+
+IFS=' ' read -r -a build_targets <<< "${BUILD_TARGET}"
+
+SCRIPT_DIR=$(cd `dirname $0` && pwd)
source "${SCRIPT_DIR}/build_helper.subr"
-cd tensorflow/contrib/makefile
+cd ${SCRIPT_DIR}
HOST_GENDIR="$(pwd)/gen/protobuf-host"
mkdir -p "${HOST_GENDIR}"
@@ -64,6 +82,10 @@ else
echo "protoc found. Skip building host tools."
fi
+# Remove old libs
+rm -f ${LIBDIR}/libprotobuf.a
+rm -f ${LIBDIR}/libprotobuf-lite.a
+
./autogen.sh
if [ $? -ne 0 ]
then
@@ -71,157 +93,192 @@ then
exit 1
fi
-make distclean
-./configure \
---host=i386-apple-${OSX_VERSION} \
---disable-shared \
---enable-cross-compile \
---with-protoc="${PROTOC_PATH}" \
---prefix=${LIBDIR}/iossim_386 \
---exec-prefix=${LIBDIR}/iossim_386 \
-"CFLAGS=${CFLAGS} \
--mios-simulator-version-min=${MIN_SDK_VERSION} \
--arch i386 \
--fembed-bitcode \
--isysroot ${IPHONESIMULATOR_SYSROOT}" \
-"CXX=${CXX}" \
-"CXXFLAGS=${CXXFLAGS} \
--mios-simulator-version-min=${MIN_SDK_VERSION} \
--arch i386 \
--fembed-bitcode \
--isysroot \
-${IPHONESIMULATOR_SYSROOT}" \
-LDFLAGS="-arch i386 \
--fembed-bitcode \
--mios-simulator-version-min=${MIN_SDK_VERSION} \
-${LDFLAGS} \
--L${IPHONESIMULATOR_SYSROOT}/usr/lib/ \
--L${IPHONESIMULATOR_SYSROOT}/usr/lib/system" \
-"LIBS=${LIBS}"
-make -j"${JOB_COUNT}"
-make install
-
-make distclean
-./configure \
---host=x86_64-apple-${OSX_VERSION} \
---disable-shared \
---enable-cross-compile \
---with-protoc="${PROTOC_PATH}" \
---prefix=${LIBDIR}/iossim_x86_64 \
---exec-prefix=${LIBDIR}/iossim_x86_64 \
-"CFLAGS=${CFLAGS} \
--mios-simulator-version-min=${MIN_SDK_VERSION} \
--arch x86_64 \
--fembed-bitcode \
--isysroot ${IPHONESIMULATOR_SYSROOT}" \
-"CXX=${CXX}" \
-"CXXFLAGS=${CXXFLAGS} \
--mios-simulator-version-min=${MIN_SDK_VERSION} \
--arch x86_64 \
--fembed-bitcode \
--isysroot \
-${IPHONESIMULATOR_SYSROOT}" \
-LDFLAGS="-arch x86_64 \
--fembed-bitcode \
--mios-simulator-version-min=${MIN_SDK_VERSION} \
-${LDFLAGS} \
--L${IPHONESIMULATOR_SYSROOT}/usr/lib/ \
--L${IPHONESIMULATOR_SYSROOT}/usr/lib/system" \
-"LIBS=${LIBS}"
-make -j"${JOB_COUNT}"
-make install
-
-make distclean
-./configure \
---host=armv7-apple-${OSX_VERSION} \
---with-protoc="${PROTOC_PATH}" \
---disable-shared \
---prefix=${LIBDIR}/ios_arm7 \
---exec-prefix=${LIBDIR}/ios_arm7 \
-"CFLAGS=${CFLAGS} \
--miphoneos-version-min=${MIN_SDK_VERSION} \
--arch armv7 \
--fembed-bitcode \
--isysroot ${IPHONEOS_SYSROOT}" \
-"CXX=${CXX}" \
-"CXXFLAGS=${CXXFLAGS} \
--miphoneos-version-min=${MIN_SDK_VERSION} \
--arch armv7 \
--fembed-bitcode \
--isysroot ${IPHONEOS_SYSROOT}" \
-LDFLAGS="-arch armv7 \
--fembed-bitcode \
--miphoneos-version-min=${MIN_SDK_VERSION} \
-${LDFLAGS}" \
-"LIBS=${LIBS}"
-make -j"${JOB_COUNT}"
-make install
-
-make distclean
-./configure \
---host=armv7s-apple-${OSX_VERSION} \
---with-protoc="${PROTOC_PATH}" \
---disable-shared \
---prefix=${LIBDIR}/ios_arm7s \
---exec-prefix=${LIBDIR}/ios_arm7s \
-"CFLAGS=${CFLAGS} \
--miphoneos-version-min=${MIN_SDK_VERSION} \
--arch armv7s \
--fembed-bitcode \
--isysroot ${IPHONEOS_SYSROOT}" \
-"CXX=${CXX}" \
-"CXXFLAGS=${CXXFLAGS} \
--miphoneos-version-min=${MIN_SDK_VERSION} \
--arch armv7s \
--fembed-bitcode \
--isysroot ${IPHONEOS_SYSROOT}" \
-LDFLAGS="-arch armv7s \
--fembed-bitcode \
--miphoneos-version-min=${MIN_SDK_VERSION} \
-${LDFLAGS}" \
-"LIBS=${LIBS}"
-make -j"${JOB_COUNT}"
-make install
-
-make distclean
-./configure \
---host=arm \
---with-protoc="${PROTOC_PATH}" \
---disable-shared \
---prefix=${LIBDIR}/ios_arm64 \
---exec-prefix=${LIBDIR}/ios_arm64 \
-"CFLAGS=${CFLAGS} \
--miphoneos-version-min=${MIN_SDK_VERSION} \
--arch arm64 \
--fembed-bitcode \
--isysroot ${IPHONEOS_SYSROOT}" \
-"CXXFLAGS=${CXXFLAGS} \
--miphoneos-version-min=${MIN_SDK_VERSION} \
--arch arm64 \
--fembed-bitcode \
--isysroot ${IPHONEOS_SYSROOT}" \
-LDFLAGS="-arch arm64 \
--fembed-bitcode \
--miphoneos-version-min=${MIN_SDK_VERSION} \
-${LDFLAGS}" \
-"LIBS=${LIBS}"
-make -j"${JOB_COUNT}"
-make install
-
-lipo \
-${LIBDIR}/iossim_386/lib/libprotobuf.a \
-${LIBDIR}/iossim_x86_64/lib/libprotobuf.a \
-${LIBDIR}/ios_arm7/lib/libprotobuf.a \
-${LIBDIR}/ios_arm7s/lib/libprotobuf.a \
-${LIBDIR}/ios_arm64/lib/libprotobuf.a \
--create \
--output ${LIBDIR}/libprotobuf.a
-
-lipo \
-${LIBDIR}/iossim_386/lib/libprotobuf-lite.a \
-${LIBDIR}/iossim_x86_64/lib/libprotobuf-lite.a \
-${LIBDIR}/ios_arm7/lib/libprotobuf-lite.a \
-${LIBDIR}/ios_arm7s/lib/libprotobuf-lite.a \
-${LIBDIR}/ios_arm64/lib/libprotobuf-lite.a \
--create \
--output ${LIBDIR}/libprotobuf-lite.a
+package_pb_library() {
+ pb_libs="${LIBDIR}/${1}/lib/libprotobuf.a"
+ if [ -f "${LIBDIR}/libprotobuf.a" ]; then
+ pb_libs="$pb_libs ${LIBDIR}/libprotobuf.a"
+ fi
+ lipo \
+ $pb_libs \
+ -create \
+ -output ${LIBDIR}/libprotobuf.a
+
+ pblite_libs="${LIBDIR}/${1}/lib/libprotobuf-lite.a"
+ if [ -f "${LIBDIR}/libprotobuf-lite.a" ]; then
+ pblite_libs="$pblite_libs ${LIBDIR}/libprotobuf-lite.a"
+ fi
+ lipo \
+ $pblite_libs \
+ -create \
+ -output ${LIBDIR}/libprotobuf-lite.a
+}
+
+build_target() {
+case "$1" in
+ i386) make distclean
+ ./configure \
+ --host=i386-apple-${OSX_VERSION} \
+ --disable-shared \
+ --enable-cross-compile \
+ --with-protoc="${PROTOC_PATH}" \
+ --prefix=${LIBDIR}/iossim_386 \
+ --exec-prefix=${LIBDIR}/iossim_386 \
+ "CFLAGS=${CFLAGS} \
+ -mios-simulator-version-min=${MIN_SDK_VERSION} \
+ -arch i386 \
+ -fembed-bitcode \
+ -isysroot ${IPHONESIMULATOR_SYSROOT}" \
+ "CXX=${CXX}" \
+ "CXXFLAGS=${CXXFLAGS} \
+ -mios-simulator-version-min=${MIN_SDK_VERSION} \
+ -arch i386 \
+ -fembed-bitcode \
+ -isysroot \
+ ${IPHONESIMULATOR_SYSROOT}" \
+ LDFLAGS="-arch i386 \
+ -fembed-bitcode \
+ -mios-simulator-version-min=${MIN_SDK_VERSION} \
+ ${LDFLAGS} \
+ -L${IPHONESIMULATOR_SYSROOT}/usr/lib/ \
+ -L${IPHONESIMULATOR_SYSROOT}/usr/lib/system" \
+ "LIBS=${LIBS}"
+ make -j"${JOB_COUNT}"
+ make install
+
+ package_pb_library "iossim_386"
+ ;;
+
+ x86_64) make distclean
+ ./configure \
+ --host=x86_64-apple-${OSX_VERSION} \
+ --disable-shared \
+ --enable-cross-compile \
+ --with-protoc="${PROTOC_PATH}" \
+ --prefix=${LIBDIR}/iossim_x86_64 \
+ --exec-prefix=${LIBDIR}/iossim_x86_64 \
+ "CFLAGS=${CFLAGS} \
+ -mios-simulator-version-min=${MIN_SDK_VERSION} \
+ -arch x86_64 \
+ -fembed-bitcode \
+ -isysroot ${IPHONESIMULATOR_SYSROOT}" \
+ "CXX=${CXX}" \
+ "CXXFLAGS=${CXXFLAGS} \
+ -mios-simulator-version-min=${MIN_SDK_VERSION} \
+ -arch x86_64 \
+ -fembed-bitcode \
+ -isysroot \
+ ${IPHONESIMULATOR_SYSROOT}" \
+ LDFLAGS="-arch x86_64 \
+ -fembed-bitcode \
+ -mios-simulator-version-min=${MIN_SDK_VERSION} \
+ ${LDFLAGS} \
+ -L${IPHONESIMULATOR_SYSROOT}/usr/lib/ \
+ -L${IPHONESIMULATOR_SYSROOT}/usr/lib/system" \
+ "LIBS=${LIBS}"
+ make -j"${JOB_COUNT}"
+ make install
+
+ package_pb_library "iossim_x86_64"
+ ;;
+
+ armv7) make distclean
+ ./configure \
+ --host=armv7-apple-${OSX_VERSION} \
+ --with-protoc="${PROTOC_PATH}" \
+ --disable-shared \
+ --prefix=${LIBDIR}/ios_arm7 \
+ --exec-prefix=${LIBDIR}/ios_arm7 \
+ "CFLAGS=${CFLAGS} \
+ -miphoneos-version-min=${MIN_SDK_VERSION} \
+ -arch armv7 \
+ -fembed-bitcode \
+ -isysroot ${IPHONEOS_SYSROOT}" \
+ "CXX=${CXX}" \
+ "CXXFLAGS=${CXXFLAGS} \
+ -miphoneos-version-min=${MIN_SDK_VERSION} \
+ -arch armv7 \
+ -fembed-bitcode \
+ -isysroot ${IPHONEOS_SYSROOT}" \
+ LDFLAGS="-arch armv7 \
+ -fembed-bitcode \
+ -miphoneos-version-min=${MIN_SDK_VERSION} \
+ ${LDFLAGS}" \
+ "LIBS=${LIBS}"
+ make -j"${JOB_COUNT}"
+ make install
+
+ package_pb_library "ios_arm7"
+ ;;
+
+ armv7s) make distclean
+ ./configure \
+ --host=armv7s-apple-${OSX_VERSION} \
+ --with-protoc="${PROTOC_PATH}" \
+ --disable-shared \
+ --prefix=${LIBDIR}/ios_arm7s \
+ --exec-prefix=${LIBDIR}/ios_arm7s \
+ "CFLAGS=${CFLAGS} \
+ -miphoneos-version-min=${MIN_SDK_VERSION} \
+ -arch armv7s \
+ -fembed-bitcode \
+ -isysroot ${IPHONEOS_SYSROOT}" \
+ "CXX=${CXX}" \
+ "CXXFLAGS=${CXXFLAGS} \
+ -miphoneos-version-min=${MIN_SDK_VERSION} \
+ -arch armv7s \
+ -fembed-bitcode \
+ -isysroot ${IPHONEOS_SYSROOT}" \
+ LDFLAGS="-arch armv7s \
+ -fembed-bitcode \
+ -miphoneos-version-min=${MIN_SDK_VERSION} \
+ ${LDFLAGS}" \
+ "LIBS=${LIBS}"
+ make -j"${JOB_COUNT}"
+ make install
+
+ package_pb_library "ios_arm7s"
+ ;;
+
+ arm64) make distclean
+ ./configure \
+ --host=arm \
+ --with-protoc="${PROTOC_PATH}" \
+ --disable-shared \
+ --prefix=${LIBDIR}/ios_arm64 \
+ --exec-prefix=${LIBDIR}/ios_arm64 \
+ "CFLAGS=${CFLAGS} \
+ -miphoneos-version-min=${MIN_SDK_VERSION} \
+ -arch arm64 \
+ -fembed-bitcode \
+ -isysroot ${IPHONEOS_SYSROOT}" \
+ "CXXFLAGS=${CXXFLAGS} \
+ -miphoneos-version-min=${MIN_SDK_VERSION} \
+ -arch arm64 \
+ -fembed-bitcode \
+ -isysroot ${IPHONEOS_SYSROOT}" \
+ LDFLAGS="-arch arm64 \
+ -fembed-bitcode \
+ -miphoneos-version-min=${MIN_SDK_VERSION} \
+ ${LDFLAGS}" \
+ "LIBS=${LIBS}"
+ make -j"${JOB_COUNT}"
+ make install
+
+ package_pb_library "ios_arm64"
+ ;;
+ *)
+ echo "Unknown ARCH"
+ exit 1
+ ;;
+esac
+}
+
+for build_element in "${build_targets[@]}"
+do
+ echo "$build_element"
+ build_target "$build_element"
+done
+
+file ${LIBDIR}/libprotobuf.a
+file ${LIBDIR}/libprotobuf-lite.a
+echo "Done building and packaging the libraries"
diff --git a/tensorflow/contrib/makefile/compile_ios_tensorflow.sh b/tensorflow/contrib/makefile/compile_ios_tensorflow.sh
index 5d1cc8b375..ae82163e11 100755
--- a/tensorflow/contrib/makefile/compile_ios_tensorflow.sh
+++ b/tensorflow/contrib/makefile/compile_ios_tensorflow.sh
@@ -43,55 +43,124 @@ then
exit 1
fi
+usage() {
+ echo "Usage: $(basename "$0") [-a]"
+ echo "-a [build_arch] build for specified arch comma separate for multiple archs (eg: x86_64,arm64)"
+ echo "default is [i386, x86_64, armv7, armv7s, arm64]"
+ exit 1
+}
+
+BUILD_TARGET="i386 x86_64 armv7 armv7s arm64"
+while getopts "a:f:h:n:" opt_name; do
+ case "$opt_name" in
+ a) BUILD_TARGET="${OPTARG}";;
+ f) BUILD_OPT="${OPTARG}";;
+ h) NSYNC_HOST="${OPTARG}";;
+ n) NSYNC_TARGET="${OPTARG}";;
+ *) usage;;
+ esac
+done
+shift $((OPTIND - 1))
+
+IFS=' ' read -r -a build_targets <<< "${BUILD_TARGET}"
+
+SCRIPT_DIR=$(cd `dirname $0` && pwd)
+source "${SCRIPT_DIR}/build_helper.subr"
+
+
GENDIR=tensorflow/contrib/makefile/gen/
LIBDIR=${GENDIR}lib
LIB_PREFIX=libtensorflow-core
-make -j"${JOB_COUNT}" -f tensorflow/contrib/makefile/Makefile \
-TARGET=IOS IOS_ARCH=ARMV7 LIB_NAME=${LIB_PREFIX}-armv7.a OPTFLAGS="$1"
-if [ $? -ne 0 ]
-then
- echo "armv7 compilation failed."
- exit 1
-fi
+#remove any old artifacts
+rm -rf ${LIBDIR}/${LIB_PREFIX}.a
-make -j"${JOB_COUNT}" -f tensorflow/contrib/makefile/Makefile \
-TARGET=IOS IOS_ARCH=ARMV7S LIB_NAME=${LIB_PREFIX}-armv7s.a OPTFLAGS="$1"
-if [ $? -ne 0 ]
-then
- echo "arm7vs compilation failed."
- exit 1
-fi
+package_tf_library() {
+ CAP_DIR=`echo $1 | tr 'a-z' 'A-Z'`
+ tf_libs="${LIBDIR}/ios_${CAP_DIR}/${LIB_PREFIX}-${1}.a"
+ if [ -f "${LIBDIR}/${LIB_PREFIX}.a" ]; then
+ tf_libs="$tf_libs ${LIBDIR}/${LIB_PREFIX}.a"
+ fi
+ lipo \
+ $tf_libs \
+ -create \
+ -output ${LIBDIR}/${LIB_PREFIX}.a
+}
-make -j"${JOB_COUNT}" -f tensorflow/contrib/makefile/Makefile \
-TARGET=IOS IOS_ARCH=ARM64 LIB_NAME=${LIB_PREFIX}-arm64.a OPTFLAGS="$1"
-if [ $? -ne 0 ]
-then
- echo "arm64 compilation failed."
- exit 1
-fi
+build_tf_target() {
+case "$1" in
+ armv7)
+ make -j"${JOB_COUNT}" -f tensorflow/contrib/makefile/Makefile \
+ TARGET=IOS IOS_ARCH=ARMV7 LIB_NAME=${LIB_PREFIX}-armv7.a \
+ OPTFLAGS="${BUILD_OPT}" HOST_NSYNC_LIB="${NSYNC_HOST}" \
+ TARGET_NSYNC_LIB="${NSYNC_TARGET}"
+ if [ $? -ne 0 ]
+ then
+ echo "armv7 compilation failed."
+ exit 1
+ fi
+ package_tf_library "armv7"
+ ;;
+ armv7s)
+ make -j"${JOB_COUNT}" -f tensorflow/contrib/makefile/Makefile \
+ TARGET=IOS IOS_ARCH=ARMV7S LIB_NAME=${LIB_PREFIX}-armv7s.a \
+ OPTFLAGS="${BUILD_OPT}" HOST_NSYNC_LIB="${NSYNC_HOST}" \
+ TARGET_NSYNC_LIB="${NSYNC_TARGET}"
-make -j"${JOB_COUNT}" -f tensorflow/contrib/makefile/Makefile \
-TARGET=IOS IOS_ARCH=I386 LIB_NAME=${LIB_PREFIX}-i386.a OPTFLAGS="$1"
-if [ $? -ne 0 ]
-then
- echo "i386 compilation failed."
- exit 1
-fi
+ if [ $? -ne 0 ]
+ then
+ echo "arm7vs compilation failed."
+ exit 1
+ fi
+ package_tf_library "armv7s"
+ ;;
+ arm64)
+ make -j"${JOB_COUNT}" -f tensorflow/contrib/makefile/Makefile \
+ TARGET=IOS IOS_ARCH=ARM64 LIB_NAME=${LIB_PREFIX}-arm64.a \
+ OPTFLAGS="${BUILD_OPT}" HOST_NSYNC_LIB="${NSYNC_HOST}" \
+ TARGET_NSYNC_LIB="${NSYNC_TARGET}"
+ if [ $? -ne 0 ]
+ then
+ echo "arm64 compilation failed."
+ exit 1
+ fi
+ package_tf_library "arm64"
+ ;;
+ i386)
+ make -j"${JOB_COUNT}" -f tensorflow/contrib/makefile/Makefile \
+ TARGET=IOS IOS_ARCH=I386 LIB_NAME=${LIB_PREFIX}-i386.a \
+ OPTFLAGS="${BUILD_OPT}" HOST_NSYNC_LIB="${NSYNC_HOST}" \
+ TARGET_NSYNC_LIB="${NSYNC_TARGET}"
+ if [ $? -ne 0 ]
+ then
+ echo "i386 compilation failed."
+ exit 1
+ fi
+ package_tf_library "i386"
+ ;;
+ x86_64)
+ make -j"${JOB_COUNT}" -f tensorflow/contrib/makefile/Makefile \
+ TARGET=IOS IOS_ARCH=X86_64 LIB_NAME=${LIB_PREFIX}-x86_64.a \
+ OPTFLAGS="${BUILD_OPT}" HOST_NSYNC_LIB="${NSYNC_HOST}" \
+ TARGET_NSYNC_LIB="${NSYNC_TARGET}"
+ if [ $? -ne 0 ]
+ then
+ echo "x86_64 compilation failed."
+ exit 1
+ fi
+ package_tf_library "x86_64"
+ ;;
+ *)
+ echo "Unknown ARCH"
+ exit 1
+esac
+}
-make -j"${JOB_COUNT}" -f tensorflow/contrib/makefile/Makefile \
-TARGET=IOS IOS_ARCH=X86_64 LIB_NAME=${LIB_PREFIX}-x86_64.a OPTFLAGS="$1"
-if [ $? -ne 0 ]
-then
- echo "x86_64 compilation failed."
- exit 1
-fi
+for build_tf_element in "${build_targets[@]}"
+do
+ echo "$build_tf_element"
+ build_tf_target "$build_tf_element"
+done
-lipo \
-${LIBDIR}/ios_ARMV7/${LIB_PREFIX}-armv7.a \
-${LIBDIR}/ios_ARMV7S/${LIB_PREFIX}-armv7s.a \
-${LIBDIR}/ios_ARM64/${LIB_PREFIX}-arm64.a \
-${LIBDIR}/ios_I386/${LIB_PREFIX}-i386.a \
-${LIBDIR}/ios_X86_64/${LIB_PREFIX}-x86_64.a \
--create \
--output ${LIBDIR}/${LIB_PREFIX}.a
+echo "Done building and packaging TF"
+file ${LIBDIR}/${LIB_PREFIX}.a
diff --git a/tensorflow/contrib/makefile/compile_nsync.sh b/tensorflow/contrib/makefile/compile_nsync.sh
index ecbd9bb825..930e6b8dea 100755
--- a/tensorflow/contrib/makefile/compile_nsync.sh
+++ b/tensorflow/contrib/makefile/compile_nsync.sh
@@ -265,7 +265,7 @@ for arch in $archs; do
-I$(NDK_ROOT)/sources/cxx-stl/gnu-libstdc++/4.9/libs/'"$arch"'/include \
-I../../platform/c++11 -I../../platform/gcc \
-I../../platform/posix -pthread
- PLATFORM_CFLAGS=-std=c++11 -Wno-narrowing '"$march_option"' -fPIE
+ PLATFORM_CFLAGS=-std=c++11 -Wno-narrowing '"$march_option"' -fPIE -fPIC
PLATFORM_LDFLAGS=-pthread
MKDEP=${CC} -M -std=c++11
PLATFORM_C=../../platform/c++11/src/nsync_semaphore_mutex.cc \
@@ -301,6 +301,9 @@ done
case "$target_platform" in
ios) nsync_platform_dir="$nsync_builds_dir/lipo.$target_platform.c++11"
+ if [ -d "$nsync_platform_dir" ]; then
+ rm -rf "$nsync_platform_dir"
+ fi
mkdir "$nsync_platform_dir"
eval lipo $platform_libs -create -output '$nsync_platform_dir/nsync.a'
echo "$nsync_platform_dir/nsync.a"
diff --git a/tensorflow/contrib/makefile/tf_op_files.txt b/tensorflow/contrib/makefile/tf_op_files.txt
index 5f06106c1d..8b77c99cb5 100644
--- a/tensorflow/contrib/makefile/tf_op_files.txt
+++ b/tensorflow/contrib/makefile/tf_op_files.txt
@@ -8,7 +8,6 @@ tensorflow/core/kernels/xent_op.cc
tensorflow/core/kernels/where_op.cc
tensorflow/core/kernels/variable_ops.cc
tensorflow/core/kernels/unpack_op.cc
-tensorflow/core/kernels/unique_op.cc
tensorflow/core/kernels/transpose_op.cc
tensorflow/core/kernels/transpose_functor_cpu.cc
tensorflow/core/kernels/training_op_helpers.cc
@@ -42,9 +41,6 @@ tensorflow/core/kernels/spectrogram_op.cc
tensorflow/core/kernels/spectrogram.cc
tensorflow/core/kernels/sparse_to_dense_op.cc
tensorflow/core/kernels/sparse_matmul_op.cc
-tensorflow/core/kernels/sparse_fill_empty_rows_op.cc
-tensorflow/core/kernels/sparse_reshape_op.c
-tensorflow/core/kernels/segment_reduction_ops.cc
tensorflow/core/kernels/softsign_op.cc
tensorflow/core/kernels/softplus_op.cc
tensorflow/core/kernels/softmax_op.cc
@@ -113,10 +109,6 @@ tensorflow/core/kernels/maxpooling_op.cc
tensorflow/core/kernels/matmul_op.cc
tensorflow/core/kernels/lrn_op.cc
tensorflow/core/kernels/logging_ops.cc
-tensorflow/core/kernels/initializable_lookup_table.c
-tensorflow/core/kernels/lookup_table_init_op.cc
-tensorflow/core/kernels/lookup_table_op.cc
-tensorflow/core/kernels/lookup_util.cc
tensorflow/core/kernels/inplace_ops.cc
tensorflow/core/kernels/in_topk_op.cc
tensorflow/core/kernels/immutable_constant_op.cc
@@ -124,18 +116,10 @@ tensorflow/core/kernels/identity_op.cc
tensorflow/core/kernels/identity_n_op.cc
tensorflow/core/kernels/gather_op.cc
tensorflow/core/kernels/gather_functor.cc
-tensorflow/core/kernels/gather_nd_op.cc
-tensorflow/core/kernels/gather_nd_op_cpu_impl_0.cc
-tensorflow/core/kernels/gather_nd_op_cpu_impl_1.cc
-tensorflow/core/kernels/gather_nd_op_cpu_impl_2.cc
-tensorflow/core/kernels/gather_nd_op_cpu_impl_3.cc
-tensorflow/core/kernels/gather_nd_op_cpu_impl_4.cc
-tensorflow/core/kernels/gather_nd_op_cpu_impl_5.cc
tensorflow/core/kernels/fused_batch_norm_op.cc
tensorflow/core/kernels/function_ops.cc
tensorflow/core/kernels/fill_functor.cc
tensorflow/core/kernels/fifo_queue.cc
-tensorflow/core/kernels/fifo_queue_op.cc
tensorflow/core/kernels/fake_quant_ops.cc
tensorflow/core/kernels/example_parsing_ops.cc
tensorflow/core/kernels/encode_wav_op.cc
@@ -182,8 +166,6 @@ tensorflow/core/kernels/cwise_op_floor.cc
tensorflow/core/kernels/cwise_op_exp.cc
tensorflow/core/kernels/cwise_op_equal_to_2.cc
tensorflow/core/kernels/cwise_op_equal_to_1.cc
-tensorflow/core/kernels/cwise_op_not_equal_to_2.cc
-tensorflow/core/kernels/cwise_op_not_equal_to_1.cc
tensorflow/core/kernels/cwise_op_div.cc
tensorflow/core/kernels/cwise_op_bitwise_xor.cc
tensorflow/core/kernels/cwise_op_bitwise_or.cc
diff --git a/tensorflow/contrib/metrics/__init__.py b/tensorflow/contrib/metrics/__init__.py
index 8eed45c4b3..302042c4dd 100644
--- a/tensorflow/contrib/metrics/__init__.py
+++ b/tensorflow/contrib/metrics/__init__.py
@@ -27,7 +27,6 @@ See the @{$python/contrib.metrics} guide.
@@streaming_false_negative_rate
@@streaming_false_negative_rate_at_thresholds
@@streaming_auc
-@@streaming_dynamic_auc
@@streaming_curve_points
@@streaming_recall_at_k
@@streaming_mean_absolute_error
@@ -89,7 +88,6 @@ from tensorflow.contrib.metrics.python.ops.metric_ops import streaming_auc
from tensorflow.contrib.metrics.python.ops.metric_ops import streaming_concat
from tensorflow.contrib.metrics.python.ops.metric_ops import streaming_covariance
from tensorflow.contrib.metrics.python.ops.metric_ops import streaming_curve_points
-from tensorflow.contrib.metrics.python.ops.metric_ops import streaming_dynamic_auc
from tensorflow.contrib.metrics.python.ops.metric_ops import streaming_false_negative_rate
from tensorflow.contrib.metrics.python.ops.metric_ops import streaming_false_negative_rate_at_thresholds
from tensorflow.contrib.metrics.python.ops.metric_ops import streaming_false_negatives
diff --git a/tensorflow/contrib/metrics/python/ops/metric_ops.py b/tensorflow/contrib/metrics/python/ops/metric_ops.py
index 24692ff12f..3dd1f1a627 100644
--- a/tensorflow/contrib/metrics/python/ops/metric_ops.py
+++ b/tensorflow/contrib/metrics/python/ops/metric_ops.py
@@ -1178,154 +1178,6 @@ def streaming_auc(predictions,
name=name)
-def _compute_dynamic_auc(labels, predictions, curve='ROC'):
- """Computes the apporixmate AUC by a Riemann sum with data-derived thresholds.
-
- Computes the area under the ROC or PR curve using each prediction as a
- threshold. This could be slow for large batches, but has the advantage of not
- having its results degrade depending on the distribution of predictions.
-
- Args:
- labels: A `Tensor` of ground truth labels with the same shape as
- `predictions` with values of 0 or 1 and type `int64`.
- predictions: A 1-D `Tensor` of predictions whose values are `float64`.
- curve: The name of the curve to be computed, 'ROC' for the Receiver
- Operating Characteristic or 'PR' for the Precision-Recall curve.
-
- Returns:
- A scalar `Tensor` containing the area-under-curve value for the input.
- """
- # Count the total number of positive and negative labels in the input.
- size = array_ops.size(predictions)
- total_positive = math_ops.cast(math_ops.reduce_sum(labels), dtypes.int32)
-
- def continue_computing_dynamic_auc():
- """Continues dynamic auc computation, entered if labels are not all equal.
-
- Returns:
- A scalar `Tensor` containing the area-under-curve value.
- """
- # Sort the predictions descending, and the corresponding labels as well.
- ordered_predictions, indices = nn.top_k(predictions, k=size)
- ordered_labels = array_ops.gather(labels, indices)
-
- # Get the counts of the unique ordered predictions.
- _, _, counts = array_ops.unique_with_counts(ordered_predictions)
-
- # Compute the indices of the split points between different predictions.
- splits = math_ops.cast(
- array_ops.pad(math_ops.cumsum(counts), paddings=[[1, 0]]), dtypes.int32)
-
- # Count the positives to the left of the split indices.
- positives = math_ops.cast(
- array_ops.pad(math_ops.cumsum(ordered_labels), paddings=[[1, 0]]),
- dtypes.int32)
- true_positives = array_ops.gather(positives, splits)
- if curve == 'ROC':
- # Count the negatives to the left of every split point and the total
- # number of negatives for computing the FPR.
- false_positives = math_ops.subtract(splits, true_positives)
- total_negative = size - total_positive
- x_axis_values = math_ops.truediv(false_positives, total_negative)
- y_axis_values = math_ops.truediv(true_positives, total_positive)
- elif curve == 'PR':
- x_axis_values = math_ops.truediv(true_positives, total_positive)
- # For conformance, set precision to 1 when the number of positive
- # classifications is 0.
- y_axis_values = array_ops.where(
- math_ops.greater(splits, 0),
- math_ops.truediv(true_positives, splits),
- array_ops.ones_like(true_positives, dtype=dtypes.float64))
-
- # Calculate trapezoid areas.
- heights = math_ops.add(y_axis_values[1:], y_axis_values[:-1]) / 2.0
- widths = math_ops.abs(
- math_ops.subtract(x_axis_values[1:], x_axis_values[:-1]))
- return math_ops.reduce_sum(math_ops.multiply(heights, widths))
-
- # If all the labels are the same, AUC isn't well-defined (but raising an
- # exception seems excessive) so we return 0, otherwise we finish computing.
- return control_flow_ops.cond(
- math_ops.logical_or(
- math_ops.equal(total_positive, 0),
- math_ops.equal(total_positive, size)
- ),
- true_fn=lambda: array_ops.constant(0, dtypes.float64),
- false_fn=continue_computing_dynamic_auc)
-
-
-def streaming_dynamic_auc(labels,
- predictions,
- curve='ROC',
- metrics_collections=(),
- updates_collections=(),
- name=None):
- """Computes the apporixmate AUC by a Riemann sum with data-derived thresholds.
-
- USAGE NOTE: this approach requires storing all of the predictions and labels
- for a single evaluation in memory, so it may not be usable when the evaluation
- batch size and/or the number of evaluation steps is very large.
-
- Computes the area under the ROC or PR curve using each prediction as a
- threshold. This has the advantage of being resilient to the distribution of
- predictions by aggregating across batches, accumulating labels and predictions
- and performing the final calculation using all of the concatenated values.
-
- Args:
- labels: A `Tensor` of ground truth labels with the same shape as
- `predictions` and with values of 0 or 1 that are castable to `int64`.
- predictions: A `Tensor` of predictions whose values are castable to
- `float64`. Will be flattened into a 1-D `Tensor`.
- curve: The name of the curve for which to compute AUC, 'ROC' for the
- Receiver Operating Characteristic or 'PR' for the Precision-Recall curve.
- metrics_collections: An optional iterable of collections that `auc` should
- be added to.
- updates_collections: An optional iterable of collections that `update_op`
- should be added to.
- name: An optional name for the variable_scope that contains the metric
- variables.
-
- Returns:
- auc: A scalar `Tensor` containing the current area-under-curve value.
- update_op: An operation that concatenates the input labels and predictions
- to the accumulated values.
-
- Raises:
- ValueError: If `labels` and `predictions` have mismatched shapes or if
- `curve` isn't a recognized curve type.
- """
-
- if curve not in ['PR', 'ROC']:
- raise ValueError('curve must be either ROC or PR, %s unknown' % curve)
-
- with variable_scope.variable_scope(name, default_name='dynamic_auc'):
- labels.get_shape().assert_is_compatible_with(predictions.get_shape())
- predictions = array_ops.reshape(
- math_ops.cast(predictions, dtypes.float64), [-1])
- labels = array_ops.reshape(math_ops.cast(labels, dtypes.int64), [-1])
- with ops.control_dependencies([
- check_ops.assert_greater_equal(
- labels,
- array_ops.zeros_like(labels, dtypes.int64),
- message='labels must be 0 or 1, at least one is <0'),
- check_ops.assert_less_equal(
- labels,
- array_ops.ones_like(labels, dtypes.int64),
- message='labels must be 0 or 1, at least one is >1')
- ]):
- preds_accum, update_preds = streaming_concat(predictions,
- name='concat_preds')
- labels_accum, update_labels = streaming_concat(labels,
- name='concat_labels')
- update_op = control_flow_ops.group(update_labels, update_preds)
- auc = _compute_dynamic_auc(labels_accum, preds_accum, curve=curve)
- if updates_collections:
- ops.add_to_collections(updates_collections, update_op)
- if metrics_collections:
- ops.add_to_collections(metrics_collections, auc)
- return auc, update_op
-
-
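For reference, the removed `_compute_dynamic_auc` is a trapezoidal Riemann sum over thresholds taken from the predictions themselves. A compact NumPy sketch of the ROC variant (hedged: it assumes distinct prediction values, whereas the removed code also grouped ties via `unique_with_counts`):

```python
import numpy as np

def roc_auc_trapezoid(labels, predictions):
    # Sweep every prediction as a threshold, sorted by descending score.
    order = np.argsort(-predictions)
    sorted_labels = labels[order]
    tp = np.concatenate([[0], np.cumsum(sorted_labels)])      # true positives
    fp = np.concatenate([[0], np.cumsum(1 - sorted_labels)])  # false positives
    tpr = tp / float(tp[-1])  # y axis
    fpr = fp / float(fp[-1])  # x axis
    # Sum trapezoid areas between consecutive ROC points.
    return np.sum((tpr[1:] + tpr[:-1]) / 2.0 * np.abs(np.diff(fpr)))

print(roc_auc_trapezoid(np.array([0, 0, 1, 1]),
                        np.array([0.1, 0.4, 0.35, 0.8])))  # 0.75
```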
def streaming_precision_recall_at_equal_thresholds(predictions,
labels,
num_thresholds=None,
@@ -3433,7 +3285,6 @@ __all__ = [
'streaming_accuracy',
'streaming_auc',
'streaming_curve_points',
- 'streaming_dynamic_auc',
'streaming_false_negative_rate',
'streaming_false_negative_rate_at_thresholds',
'streaming_false_negatives',
diff --git a/tensorflow/contrib/metrics/python/ops/metric_ops_test.py b/tensorflow/contrib/metrics/python/ops/metric_ops_test.py
index 5d0463e1f7..6a8e58b4da 100644
--- a/tensorflow/contrib/metrics/python/ops/metric_ops_test.py
+++ b/tensorflow/contrib/metrics/python/ops/metric_ops_test.py
@@ -1708,34 +1708,6 @@ class StreamingCurvePointsTest(test.TestCase):
[[1.0, 4.0 / 6.0], [0.75, 1.0], [0.0, 1.0]])
-def _np_auc(predictions, labels, weights=None):
- """Computes the AUC explicitly using Numpy.
-
- Args:
- predictions: an ndarray with shape [N].
- labels: an ndarray with shape [N].
- weights: an ndarray with shape [N].
-
- Returns:
- the area under the ROC curve.
- """
- if weights is None:
- weights = np.ones(np.size(predictions))
- is_positive = labels > 0
- num_positives = np.sum(weights[is_positive])
- num_negatives = np.sum(weights[~is_positive])
-
- # Sort descending:
- inds = np.argsort(-predictions)
-
- sorted_labels = labels[inds]
- sorted_weights = weights[inds]
- is_positive = sorted_labels > 0
-
- tp = np.cumsum(sorted_weights * is_positive) / num_positives
- return np.sum((sorted_weights * tp)[~is_positive]) / num_negatives
-
-
class StreamingAUCTest(test.TestCase):
def setUp(self):
@@ -1924,6 +1896,33 @@ class StreamingAUCTest(test.TestCase):
self.assertAlmostEqual(1, auc.eval(), 6)
+ def np_auc(self, predictions, labels, weights):
+ """Computes the AUC explicitly using Numpy.
+
+ Args:
+ predictions: an ndarray with shape [N].
+ labels: an ndarray with shape [N].
+ weights: an ndarray with shape [N].
+
+ Returns:
+ the area under the ROC curve.
+ """
+ if weights is None:
+ weights = np.ones(np.size(predictions))
+ is_positive = labels > 0
+ num_positives = np.sum(weights[is_positive])
+ num_negatives = np.sum(weights[~is_positive])
+
+ # Sort descending:
+ inds = np.argsort(-predictions)
+
+ sorted_labels = labels[inds]
+ sorted_weights = weights[inds]
+ is_positive = sorted_labels > 0
+
+ tp = np.cumsum(sorted_weights * is_positive) / num_positives
+ return np.sum((sorted_weights * tp)[~is_positive]) / num_negatives
+
def testWithMultipleUpdates(self):
num_samples = 1000
batch_size = 10
@@ -1946,7 +1945,7 @@ class StreamingAUCTest(test.TestCase):
for weights in (None, np.ones(num_samples), np.random.exponential(
scale=1.0, size=num_samples)):
- expected_auc = _np_auc(predictions, labels, weights)
+ expected_auc = self.np_auc(predictions, labels, weights)
with self.test_session() as sess:
enqueue_ops = [[] for i in range(num_batches)]
@@ -1975,211 +1974,6 @@ class StreamingAUCTest(test.TestCase):
self.assertAlmostEqual(expected_auc, auc.eval(), 2)
-class StreamingDynamicAUCTest(test.TestCase):
-
- def setUp(self):
- super(StreamingDynamicAUCTest, self).setUp()
- np.random.seed(1)
- ops.reset_default_graph()
-
- def testUnknownCurve(self):
- with self.assertRaisesRegexp(
- ValueError, 'curve must be either ROC or PR, TEST_CURVE unknown'):
- metrics.streaming_dynamic_auc(labels=array_ops.ones((10, 1)),
- predictions=array_ops.ones((10, 1)),
- curve='TEST_CURVE')
-
- def testVars(self):
- metrics.streaming_dynamic_auc(
- labels=array_ops.ones((10, 1)), predictions=array_ops.ones((10, 1)))
- _assert_metric_variables(self, ['dynamic_auc/concat_labels/array:0',
- 'dynamic_auc/concat_labels/size:0',
- 'dynamic_auc/concat_preds/array:0',
- 'dynamic_auc/concat_preds/size:0'])
-
- def testMetricsCollection(self):
- my_collection_name = '__metrics__'
- auc, _ = metrics.streaming_dynamic_auc(
- labels=array_ops.ones((10, 1)),
- predictions=array_ops.ones((10, 1)),
- metrics_collections=[my_collection_name])
- self.assertEqual(ops.get_collection(my_collection_name), [auc])
-
- def testUpdatesCollection(self):
- my_collection_name = '__updates__'
- _, update_op = metrics.streaming_dynamic_auc(
- labels=array_ops.ones((10, 1)),
- predictions=array_ops.ones((10, 1)),
- updates_collections=[my_collection_name])
- self.assertEqual(ops.get_collection(my_collection_name), [update_op])
-
- def testValueTensorIsIdempotent(self):
- predictions = random_ops.random_uniform(
- (10, 3), maxval=1, dtype=dtypes_lib.float32, seed=1)
- labels = random_ops.random_uniform(
- (10, 3), maxval=2, dtype=dtypes_lib.int64, seed=2)
- auc, update_op = metrics.streaming_dynamic_auc(labels, predictions)
- with self.test_session() as sess:
- sess.run(variables.local_variables_initializer())
- # Run several updates.
- for _ in xrange(10):
- sess.run(update_op)
- # Then verify idempotency.
- initial_auc = auc.eval()
- for _ in xrange(10):
- self.assertAlmostEqual(initial_auc, auc.eval(), 5)
-
- def testAllLabelsOnes(self):
- with self.test_session() as sess:
- predictions = constant_op.constant([1., 1., 1.])
- labels = constant_op.constant([1, 1, 1])
- auc, update_op = metrics.streaming_dynamic_auc(labels, predictions)
- sess.run(variables.local_variables_initializer())
- sess.run(update_op)
- self.assertEqual(0, auc.eval())
-
- def testAllLabelsZeros(self):
- with self.test_session() as sess:
- predictions = constant_op.constant([1., 1., 1.])
- labels = constant_op.constant([0, 0, 0])
- auc, update_op = metrics.streaming_dynamic_auc(labels, predictions)
- sess.run(variables.local_variables_initializer())
- sess.run(update_op)
- self.assertEqual(0, auc.eval())
-
- def testNonZeroOnePredictions(self):
- with self.test_session() as sess:
- predictions = constant_op.constant([2.5, -2.5, 2.5, -2.5],
- dtype=dtypes_lib.float32)
- labels = constant_op.constant([1, 0, 1, 0])
- auc, update_op = metrics.streaming_dynamic_auc(labels, predictions)
- sess.run(variables.local_variables_initializer())
- sess.run(update_op)
- self.assertAlmostEqual(auc.eval(), 1.0)
-
- def testAllCorrect(self):
- inputs = np.random.randint(0, 2, size=(100, 1))
- with self.test_session() as sess:
- predictions = constant_op.constant(inputs)
- labels = constant_op.constant(inputs)
- auc, update_op = metrics.streaming_dynamic_auc(labels, predictions)
- sess.run(variables.local_variables_initializer())
- sess.run(update_op)
- self.assertEqual(1, auc.eval())
-
- def testSomeCorrect(self):
- with self.test_session() as sess:
- predictions = constant_op.constant([1, 0, 1, 0])
- labels = constant_op.constant([0, 1, 1, 0])
- auc, update_op = metrics.streaming_dynamic_auc(labels, predictions)
- sess.run(variables.local_variables_initializer())
- sess.run(update_op)
- self.assertAlmostEqual(0.5, auc.eval())
-
- def testAllIncorrect(self):
- inputs = np.random.randint(0, 2, size=(100, 1))
- with self.test_session() as sess:
- predictions = constant_op.constant(inputs, dtype=dtypes_lib.float32)
- labels = constant_op.constant(1 - inputs, dtype=dtypes_lib.float32)
- auc, update_op = metrics.streaming_dynamic_auc(labels, predictions)
- sess.run(variables.local_variables_initializer())
- sess.run(update_op)
- self.assertAlmostEqual(0, auc.eval())
-
- def testExceptionOnIncompatibleShapes(self):
- with self.test_session() as sess:
- predictions = array_ops.ones([5])
- labels = array_ops.zeros([6])
- with self.assertRaisesRegexp(ValueError, 'Shapes .* are incompatible'):
- _, update_op = metrics.streaming_dynamic_auc(labels, predictions)
- sess.run(variables.local_variables_initializer())
- sess.run(update_op)
-
- def testExceptionOnGreaterThanOneLabel(self):
- with self.test_session() as sess:
- predictions = constant_op.constant([1, 0.5, 0], dtypes_lib.float32)
- labels = constant_op.constant([2, 1, 0])
- _, update_op = metrics.streaming_dynamic_auc(labels, predictions)
- sess.run(variables.local_variables_initializer())
- with self.assertRaisesRegexp(
- errors_impl.InvalidArgumentError,
- '.*labels must be 0 or 1, at least one is >1.*'):
- sess.run(update_op)
-
- def testExceptionOnNegativeLabel(self):
- with self.test_session() as sess:
- predictions = constant_op.constant([1, 0.5, 0], dtypes_lib.float32)
- labels = constant_op.constant([1, 0, -1])
- _, update_op = metrics.streaming_dynamic_auc(labels, predictions)
- sess.run(variables.local_variables_initializer())
- with self.assertRaisesRegexp(
- errors_impl.InvalidArgumentError,
- '.*labels must be 0 or 1, at least one is <0.*'):
- sess.run(update_op)
-
- def testWithMultipleUpdates(self):
- batch_size = 10
- num_batches = 100
- labels = np.array([])
- predictions = np.array([])
- tf_labels = variables.Variable(array_ops.ones(batch_size, dtypes_lib.int32),
- collections=[ops.GraphKeys.LOCAL_VARIABLES],
- dtype=dtypes_lib.int32)
- tf_predictions = variables.Variable(
- array_ops.ones(batch_size),
- collections=[ops.GraphKeys.LOCAL_VARIABLES],
- dtype=dtypes_lib.float32)
- auc, update_op = metrics.streaming_dynamic_auc(tf_labels, tf_predictions)
- with self.test_session() as sess:
- sess.run(variables.local_variables_initializer())
- for _ in xrange(num_batches):
- new_labels = np.random.randint(0, 2, size=batch_size)
- noise = np.random.normal(0.0, scale=0.2, size=batch_size)
- new_predictions = 0.4 + 0.2 * new_labels + noise
- labels = np.concatenate([labels, new_labels])
- predictions = np.concatenate([predictions, new_predictions])
- sess.run(tf_labels.assign(new_labels))
- sess.run(tf_predictions.assign(new_predictions))
- sess.run(update_op)
- expected_auc = _np_auc(predictions, labels)
- self.assertAlmostEqual(expected_auc, auc.eval())
-
- def testAUCPRReverseIncreasingPredictions(self):
- with self.test_session() as sess:
- predictions = constant_op.constant(
- [0.1, 0.4, 0.35, 0.8], dtype=dtypes_lib.float32)
- labels = constant_op.constant([0, 0, 1, 1])
- auc, update_op = metrics.streaming_dynamic_auc(
- labels, predictions, curve='PR')
- sess.run(variables.local_variables_initializer())
- sess.run(update_op)
- self.assertAlmostEqual(0.79166, auc.eval(), delta=1e-5)
-
- def testAUCPRJumbledPredictions(self):
- with self.test_session() as sess:
- predictions = constant_op.constant(
- [0.1, 0.4, 0.35, 0.8, 0.1, 0.135, 0.81], dtypes_lib.float32)
- labels = constant_op.constant([0, 0, 1, 0, 1, 0, 1])
- auc, update_op = metrics.streaming_dynamic_auc(
- labels, predictions, curve='PR')
- sess.run(variables.local_variables_initializer())
- sess.run(update_op)
- self.assertAlmostEqual(0.610317, auc.eval(), delta=1e-6)
-
- def testAUCPRPredictionsLessThanHalf(self):
- with self.test_session() as sess:
- predictions = constant_op.constant(
- [0.0, 0.1, 0.2, 0.33, 0.3, 0.4, 0.5],
- shape=(1, 7),
- dtype=dtypes_lib.float32)
- labels = constant_op.constant([0, 0, 0, 0, 1, 1, 1], shape=(1, 7))
- auc, update_op = metrics.streaming_dynamic_auc(
- labels, predictions, curve='PR')
- sess.run(variables.local_variables_initializer())
- sess.run(update_op)
- self.assertAlmostEqual(0.90277, auc.eval(), delta=1e-5)
-
-
class StreamingPrecisionRecallAtEqualThresholdsTest(test.TestCase):
def setUp(self):
diff --git a/tensorflow/contrib/nccl/BUILD b/tensorflow/contrib/nccl/BUILD
index df9dbb457a..ed9fb64b95 100644
--- a/tensorflow/contrib/nccl/BUILD
+++ b/tensorflow/contrib/nccl/BUILD
@@ -48,8 +48,8 @@ tf_cuda_cc_test(
# Disabled on jenkins until errors finding nvmlShutdown are found.
tags = [
"manual",
- "multi_gpu",
"no_oss",
+ "noguitar", # note: is run manually there
"notap",
],
deps = if_cuda(
@@ -138,8 +138,8 @@ cuda_py_test(
# Disabled on jenkins until errors finding nvmlShutdown are found.
tags = [
"manual",
- "multi_gpu",
"no_oss",
+ "noguitar", # note: is run manually there
"notap",
],
)
diff --git a/tensorflow/contrib/nccl/python/ops/nccl_ops_test.py b/tensorflow/contrib/nccl/python/ops/nccl_ops_test.py
index bad0abd44c..0b13e3595e 100644
--- a/tensorflow/contrib/nccl/python/ops/nccl_ops_test.py
+++ b/tensorflow/contrib/nccl/python/ops/nccl_ops_test.py
@@ -72,15 +72,14 @@ class NcclTestCase(test.TestCase):
two.
device_sets: Tuple of virtual devices to run test on.
"""
+ if not test.is_gpu_available():
+ return # Test requires access to a GPU
+
for dtype in [np.float32, np.int32, np.int64, np.float64]:
# Create session inside outer loop to test use of
# same communicator across multiple sessions.
with self.test_session(use_gpu=True) as sess:
- # Check GPU availability *after* creating test session, see b/68975239.
- if not test.is_gpu_available():
- return # Test requires access to a GPU
-
for devices in device_sets:
shape = (3, 4)
random = (np.random.random_sample(shape) - .5) * 1024
diff --git a/tensorflow/contrib/nn/__init__.py b/tensorflow/contrib/nn/__init__.py
index 3bf795d19a..0bc133a00e 100644
--- a/tensorflow/contrib/nn/__init__.py
+++ b/tensorflow/contrib/nn/__init__.py
@@ -15,6 +15,7 @@
"""Module for variants of ops in tf.nn.
@@alpha_dropout
+@@conv1d_transpose
@@deprecated_flipped_softmax_cross_entropy_with_logits
@@deprecated_flipped_sparse_softmax_cross_entropy_with_logits
@@deprecated_flipped_sigmoid_cross_entropy_with_logits
@@ -32,6 +33,7 @@ from tensorflow.contrib.nn.python.ops.alpha_dropout import *
from tensorflow.contrib.nn.python.ops.cross_entropy import *
from tensorflow.contrib.nn.python.ops.sampling_ops import *
from tensorflow.contrib.nn.python.ops.scaled_softplus import *
+from tensorflow.python.ops.nn_ops import conv1d_transpose
from tensorflow.python.ops.nn_ops import nth_element
# pylint: enable=unused-import,wildcard-import
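`conv1d_transpose` is the 1-D analogue of `conv2d_transpose`: it upsamples the width dimension by the stride. A hedged usage sketch (the shapes and keyword names are assumptions based on the core `nn_ops` implementation at this commit):

```python
import tensorflow as tf

value = tf.placeholder(tf.float32, shape=(4, 16, 8))    # [batch, width, in_ch]
filters = tf.get_variable('deconv_f', shape=(5, 3, 8))  # [width, out_ch, in_ch]
out = tf.contrib.nn.conv1d_transpose(
    value, filters, output_shape=[4, 32, 3], stride=2, padding='SAME')
# out: shape [4, 32, 3], width 16 -> 32 via stride=2, channels 8 -> 3.
```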
diff --git a/tensorflow/contrib/opt/BUILD b/tensorflow/contrib/opt/BUILD
index 096d2270e4..022e5ab06f 100644
--- a/tensorflow/contrib/opt/BUILD
+++ b/tensorflow/contrib/opt/BUILD
@@ -18,6 +18,7 @@ py_library(
"python/training/external_optimizer.py",
"python/training/lazy_adam_optimizer.py",
"python/training/moving_average_optimizer.py",
+ "python/training/multitask_optimizer_wrapper.py",
"python/training/nadam_optimizer.py",
"python/training/variable_clipping_optimizer.py",
],
@@ -96,6 +97,23 @@ py_test(
)
py_test(
+ name = "multitask_optimizer_wrapper_test",
+ srcs = ["python/training/multitask_optimizer_wrapper_test.py"],
+ srcs_version = "PY2AND3",
+ deps = [
+ ":opt_py",
+ "//tensorflow/python:client",
+ "//tensorflow/python:client_testlib",
+ "//tensorflow/python:constant_op",
+ "//tensorflow/python:dtypes",
+ "//tensorflow/python:training",
+ "//tensorflow/python:variables",
+ "//third_party/py/numpy",
+ "@six_archive//:six",
+ ],
+)
+
+py_test(
name = "lazy_adam_optimizer_test",
srcs = ["python/training/lazy_adam_optimizer_test.py"],
srcs_version = "PY2AND3",
diff --git a/tensorflow/contrib/opt/__init__.py b/tensorflow/contrib/opt/__init__.py
index e194fa2d4d..af47e3937a 100644
--- a/tensorflow/contrib/opt/__init__.py
+++ b/tensorflow/contrib/opt/__init__.py
@@ -24,7 +24,7 @@ from tensorflow.contrib.opt.python.training.external_optimizer import *
from tensorflow.contrib.opt.python.training.lazy_adam_optimizer import *
from tensorflow.contrib.opt.python.training.nadam_optimizer import *
from tensorflow.contrib.opt.python.training.moving_average_optimizer import *
-from tensorflow.contrib.opt.python.training.nadam_optimizer import *
+from tensorflow.contrib.opt.python.training.multitask_optimizer_wrapper import *
from tensorflow.contrib.opt.python.training.variable_clipping_optimizer import *
# pylint: enable=wildcard-import
@@ -35,7 +35,8 @@ _allowed_symbols = [
'DelayCompensatedGradientDescentOptimizer',
'DropStaleGradientOptimizer', 'ExternalOptimizerInterface',
'LazyAdamOptimizer', 'NadamOptimizer', 'MovingAverageOptimizer',
- 'ScipyOptimizerInterface', 'VariableClippingOptimizer'
+ 'ScipyOptimizerInterface', 'VariableClippingOptimizer',
+ 'MultitaskOptimizerWrapper', 'clip_gradients_by_global_norm',
]
remove_undocumented(__name__, _allowed_symbols)
diff --git a/tensorflow/contrib/opt/python/training/multitask_optimizer_wrapper.py b/tensorflow/contrib/opt/python/training/multitask_optimizer_wrapper.py
new file mode 100644
index 0000000000..c26037935d
--- /dev/null
+++ b/tensorflow/contrib/opt/python/training/multitask_optimizer_wrapper.py
@@ -0,0 +1,138 @@
+# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+
+"""An optimizer wrapper that ensures correct behaviour
+of stateful optimizers with multitask loss."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import types
+import six
+
+from tensorflow.python.framework import dtypes
+from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import clip_ops
+from tensorflow.python.ops import control_flow_ops
+from tensorflow.python.ops import math_ops
+from tensorflow.python.training import optimizer
+
+__all__ = ["MultitaskOptimizerWrapper",
+ "clip_gradients_by_global_norm"]
+
+def _is_all_zeros(grad):
+ all_zeros = math_ops.equal(math_ops.count_nonzero(grad), 0)
+ return all_zeros
+
+def _get_wrapper(fn, opt):
+ def wrapper(self, grad, *args, **kwargs): # pylint: disable=unused-argument
+ all_zeros = _is_all_zeros(grad)
+ return control_flow_ops.cond(
+ all_zeros,
+ control_flow_ops.no_op,
+ lambda: fn(grad, *args, **kwargs))
+ wrapper = types.MethodType(wrapper, opt)
+ return wrapper
+
+class MultitaskOptimizerWrapper(object):
+ """Optimizer wrapper that ensures that
+ all-zero gradients don't affect the optimizer state.
+
+ This might be useful when a multi-task loss is used,
+ and some components of the loss might not be
+ present (e.g. masked out) in some training batches.
+ Technically their gradient would be zero,
+ which would normally affect the optimizer state
+ (e.g. push the running average towards zero).
+ However, this is not the desired behaviour,
+ since the missing loss component
+ should be treated as unknown rather than zero.
+
+ This wrapper filters out all-zero gradient tensors,
+ therefore preserving the optimizer state.
+
+ If gradient clipping by global norm is used,
+ the provided function clip_gradients_by_global_norm
+ should be used (and specified explicitly by the user).
+ Otherwise the global norm would be underestimated
+ because of all-zero tensors that should be ignored.
+
+ The gradient calculation and application
+ are delegated to an underlying optimizer.
+ The gradient application is altered only for all-zero tensors.
+
+ Example:
+ ```python
+ momentum_optimizer = tf.train.MomentumOptimizer(
+ learning_rate, momentum=0.9)
+ multitask_momentum_optimizer = tf.contrib.opt.MultitaskOptimizerWrapper(
+ momentum_optimizer)
+ gradvars = multitask_momentum_optimizer.compute_gradients(
+ loss)
+ gradvars_clipped, _ = tf.contrib.opt.clip_gradients_by_global_norm(
+ gradvars, 15.0)
+ train_op = multitask_momentum_optimizer.apply_gradients(
+ gradvars_clipped, global_step=batch)
+ ```
+ """
+ def __init__(self, opt):
+ """
+ Args:
+ opt: an instance of a class that implements tf.train.Optimizer.
+ """
+ if not isinstance(opt, optimizer.Optimizer):
+ raise TypeError(
+ "Supplied optimizer must be an instance of tf.train.Optimizer")
+ self._opt = opt
+ overridden_methods = ('_apply_dense',
+ '_resource_apply_dense',
+ '_apply_sparse',
+ '_resource_apply_sparse')
+ for name in overridden_methods:
+ fn = getattr(self._opt, name)
+ wrapper = _get_wrapper(fn, self._opt)
+ setattr(self._opt, name, wrapper)
+
+ def __getattr__(self, name):
+ return getattr(self._opt, name)
+
+
+def clip_gradients_by_global_norm(gradients_variables, clip_norm=20.):
+ """Clips gradients of a multitask loss by their global norm.
+ Ignores all-zero tensors when computing the global norm.
+
+ Args:
+ gradients_variables: a list of pairs (gradient, variable).
+ clip_norm: a float Tensor, the global norm to clip on. Default is 20.0.
+
+ Returns:
+ list: A list of pairs of the same type as gradients_variables.
+ fixed_global_norm: A 0-D (scalar) Tensor representing the global norm.
+ """
+ gradients, variables = six.moves.zip(*gradients_variables)
+ def _replace_nonexisting_grad(grad):
+ if grad is None:
+ return grad
+ all_zeros = _is_all_zeros(grad)
+ return control_flow_ops.cond(all_zeros,
+ lambda: array_ops.zeros(
+ [], dtype=dtypes.as_dtype(grad.dtype)),
+ lambda: grad)
+ nonzero_gradients = [_replace_nonexisting_grad(g) for g in gradients]
+ fixed_global_norm = clip_ops.global_norm(nonzero_gradients)
+ gradients, _ = clip_ops.clip_by_global_norm(gradients, clip_norm,
+ use_norm=fixed_global_norm)
+ return list(six.moves.zip(gradients, variables)), fixed_global_norm
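To make the gating in `_get_wrapper` above concrete, here is a minimal standalone sketch of the same idea in plain TF 1.x ops; the `slot` variable merely stands in for an optimizer slot and is purely illustrative, not part of the wrapper's API:

```python
import tensorflow as tf

grad = tf.placeholder(tf.float32, shape=[2])
slot = tf.Variable([0.0, 0.0])  # stands in for an optimizer slot variable

all_zeros = tf.equal(tf.count_nonzero(grad), 0)
# Run the stateful update only when the gradient has a nonzero entry;
# for an all-zero gradient the slot is read back unchanged.
update = tf.cond(all_zeros,
                 lambda: tf.identity(slot),
                 lambda: tf.assign_add(slot, grad))

with tf.Session() as sess:
  sess.run(tf.global_variables_initializer())
  sess.run(update, feed_dict={grad: [0.0, 0.0]})  # slot stays [0., 0.]
  sess.run(update, feed_dict={grad: [1.0, 2.0]})  # slot becomes [1., 2.]
```

The wrapper itself applies this cond-gating to the optimizer's `_apply_dense`, `_resource_apply_dense`, `_apply_sparse`, and `_resource_apply_sparse` methods, so the stateful branch only runs for gradients with at least one nonzero entry.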
diff --git a/tensorflow/contrib/opt/python/training/multitask_optimizer_wrapper_test.py b/tensorflow/contrib/opt/python/training/multitask_optimizer_wrapper_test.py
new file mode 100644
index 0000000000..b06213f715
--- /dev/null
+++ b/tensorflow/contrib/opt/python/training/multitask_optimizer_wrapper_test.py
@@ -0,0 +1,119 @@
+# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Tests for MultitaskOptimizerWrapper."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from tensorflow.contrib.opt.python.training import multitask_optimizer_wrapper
+from tensorflow.python.framework import constant_op
+from tensorflow.python.framework import dtypes
+from tensorflow.python.ops import variables
+from tensorflow.python.platform import test
+from tensorflow.python.training import momentum
+
+import numpy as np
+import six
+
+class MultitaskOptimizerWrapperTest(test.TestCase):
+ """
+ Tests for the multitask optimizer wrapper.
+ """
+ def testWrapper(self):
+ with self.test_session():
+ var0 = variables.Variable([1.0, 2.0], dtype=dtypes.float32)
+ var1 = variables.Variable([3.0, 4.0], dtype=dtypes.float32)
+ grads0 = constant_op.constant([0.1, 0.1], dtype=dtypes.float32)
+ grads1 = constant_op.constant([0.01, 0.01], dtype=dtypes.float32)
+ grads_allzero = constant_op.constant([0.0, 0.0], dtype=dtypes.float32)
+ mom_opt_impl = momentum.MomentumOptimizer(
+ learning_rate=2.0, momentum=0.9)
+ mom_opt = multitask_optimizer_wrapper.MultitaskOptimizerWrapper(
+ mom_opt_impl)
+ mom_update = mom_opt.apply_gradients(
+ zip([grads0, grads1], [var0, var1]))
+ mom_update_partial = mom_opt.apply_gradients(
+ zip([grads_allzero, grads1], [var0, var1]))
+ mom_update_no_action = mom_opt.apply_gradients(
+ zip([grads_allzero, grads_allzero], [var0, var1]))
+ self.evaluate(variables.global_variables_initializer())
+ # Fetch params to validate initial values
+ self.assertAllClose([1.0, 2.0], self.evaluate(var0))
+ self.assertAllClose([3.0, 4.0], self.evaluate(var1))
+
+ self.assertEqual(["momentum"], mom_opt.get_slot_names())
+ slot0 = mom_opt.get_slot(var0, "momentum")
+ self.assertEqual(slot0.get_shape(), var0.get_shape())
+ slot1 = mom_opt.get_slot(var1, "momentum")
+ self.assertEqual(slot1.get_shape(), var1.get_shape())
+
+ # Step 1: normal momentum update.
+ self.evaluate(mom_update)
+ # Check that the momentum accumulators have been updated.
+ self.assertAllCloseAccordingToType(np.array([0.1, 0.1]),
+ self.evaluate(slot0))
+ self.assertAllCloseAccordingToType(np.array([0.01, 0.01]),
+ self.evaluate(slot1))
+ # Check that the parameters have been updated.
+ self.assertAllCloseAccordingToType(
+ np.array([1.0 - (0.1 * 2.0), 2.0 - (0.1 * 2.0)]),
+ self.evaluate(var0))
+ self.assertAllCloseAccordingToType(
+ np.array([3.0 - (0.01 * 2.0), 4.0 - (0.01 * 2.0)]),
+ self.evaluate(var1))
+
+ # Step 2: momentum update that changes only slot1 but not slot0.
+ self.evaluate(mom_update_partial)
+ # Check that only the relevant momentum accumulator has been updated.
+ self.assertAllCloseAccordingToType(np.array([0.1, 0.1]),
+ self.evaluate(slot0))
+ self.assertAllCloseAccordingToType(
+ np.array([(0.9 * 0.01 + 0.01), (0.9 * 0.01 + 0.01)]),
+ self.evaluate(slot1))
+
+ # Step 3: momentum update that does not change anything.
+ self.evaluate(mom_update_no_action)
+ # Check that the momentum accumulators have *NOT* been updated.
+ self.assertAllCloseAccordingToType(np.array([0.1, 0.1]),
+ self.evaluate(slot0))
+ self.assertAllCloseAccordingToType(
+ np.array([(0.9 * 0.01 + 0.01), (0.9 * 0.01 + 0.01)]),
+ self.evaluate(slot1))
+
+ def testGradientClipping(self):
+ with self.test_session():
+ var0 = variables.Variable([1.0, 2.0], dtype=dtypes.float32)
+ var1 = variables.Variable([3.0, 4.0], dtype=dtypes.float32)
+ var2 = variables.Variable([3.0, 4.0], dtype=dtypes.float32)
+ var3 = variables.Variable([3.0, 4.0], dtype=dtypes.float32)
+ grads0 = constant_op.constant([10.0, 15.0], dtype=dtypes.float32)
+ grads1 = constant_op.constant([0.0, 5.0], dtype=dtypes.float32)
+ grads2 = constant_op.constant([0.0, 0.0], dtype=dtypes.float32)
+ grads3 = None
+ varlist = [var0, var1, var2, var3]
+ gradients = [grads0, grads1, grads2, grads3]
+ clipped_gradvars, global_norm = multitask_optimizer_wrapper.clip_gradients_by_global_norm(
+ six.moves.zip(gradients, varlist), clip_norm=1.0)
+ clipped_grads = list(six.moves.zip(*clipped_gradvars))[0]
+ reference_global_norm = np.sqrt(np.sum(np.square([10.0, 15.0, 0.0, 5.0])))
+ self.assertAllCloseAccordingToType(
+ self.evaluate(global_norm), reference_global_norm)
+ self.assertAllCloseAccordingToType(
+ self.evaluate(clipped_grads[2]), np.array([0., 0.]))
+ self.assertIsNone(clipped_grads[3])
+
+if __name__ == "__main__":
+ test.main()
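The slot values asserted in `testWrapper` above follow from the standard momentum accumulator update, `accum <- momentum * accum + grad`; a quick numpy recomputation of the numbers used in the test:

```python
import numpy as np

momentum, lr = 0.9, 2.0
accum0, accum1 = 0.0, 0.0

# Step 1: both gradients are nonzero, so both slots update.
accum0 = momentum * accum0 + 0.1    # -> 0.1
accum1 = momentum * accum1 + 0.01   # -> 0.01
var0 = 1.0 - lr * accum0            # -> 0.8, matching 1.0 - (0.1 * 2.0)

# Step 2 (partial): grads0 is all zeros, so accum0 must stay at 0.1
# under the wrapper, while accum1 advances to 0.9 * 0.01 + 0.01.
accum1 = momentum * accum1 + 0.01
assert np.isclose(accum1, 0.019)
```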
diff --git a/tensorflow/contrib/quantize/BUILD b/tensorflow/contrib/quantize/BUILD
index 45a98c7f85..935af80e7a 100644
--- a/tensorflow/contrib/quantize/BUILD
+++ b/tensorflow/contrib/quantize/BUILD
@@ -133,6 +133,7 @@ py_library(
deps = [
"//tensorflow/contrib/framework:framework_py",
"//tensorflow/python:array_ops",
+ "//tensorflow/python:check_ops",
"//tensorflow/python:framework_ops",
"//tensorflow/python:init_ops",
"//tensorflow/python:math_ops",
@@ -142,23 +143,6 @@ py_library(
],
)
-py_test(
- name = "quant_ops_test",
- size = "small",
- srcs = ["python/quant_ops_test.py"],
- srcs_version = "PY2AND3",
- deps = [
- ":quant_ops",
- "//tensorflow/python:array_ops",
- "//tensorflow/python:dtypes",
- "//tensorflow/python:framework_ops",
- "//tensorflow/python:framework_test_lib",
- "//tensorflow/python:platform_test",
- "//tensorflow/python:session",
- "//tensorflow/python:variables",
- ],
-)
-
py_library(
name = "quantize",
srcs = ["python/quantize.py"],
diff --git a/tensorflow/contrib/quantize/python/quant_ops.py b/tensorflow/contrib/quantize/python/quant_ops.py
index f80d427ff0..0a38ef9fcd 100644
--- a/tensorflow/contrib/quantize/python/quant_ops.py
+++ b/tensorflow/contrib/quantize/python/quant_ops.py
@@ -22,12 +22,15 @@ from tensorflow.contrib.framework.python.ops import add_arg_scope
from tensorflow.contrib.framework.python.ops import model_variable
from tensorflow.python.framework import ops
from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import check_ops
from tensorflow.python.ops import init_ops
from tensorflow.python.ops import math_ops
from tensorflow.python.ops import state_ops
from tensorflow.python.ops import variable_scope
from tensorflow.python.training import moving_averages
+EPSILON = 1e-5
+
@add_arg_scope
def FixedQuantize(inputs, init_min=-6.0, init_max=6.0, scope=None):
@@ -130,10 +133,12 @@ def LastValueQuantize(inputs,
batch_min = inputs
else:
batch_min = math_ops.reduce_min(inputs, name='BatchMin')
- # TFLite requires that 0.0 if always in the [min; max] range.
+ batch_min -= EPSILON
+ # B-eng requires that 0.0 is always in the [min; max] range.
batch_min = math_ops.minimum(batch_min, 0.0)
- assign_min = state_ops.assign(min_var, batch_min, name='AssignMinLast')
- ops.add_to_collection(updates_collection, assign_min.op)
+ assign_min_op = state_ops.assign(
+ min_var, batch_min, name='AssignMinLast').op
+ ops.add_to_collection(updates_collection, assign_min_op)
if per_channel:
if input_dim >= 2:
@@ -143,15 +148,17 @@ def LastValueQuantize(inputs,
batch_max = inputs
else:
batch_max = math_ops.reduce_max(inputs, name='BatchMax')
- # TFLite requires that 0.0 if always in the [min; max] range.
+ batch_max += EPSILON
+ # B-eng requires that 0.0 is always in the [min; max] range.
batch_max = math_ops.maximum(batch_max, 0.0)
- assign_max = state_ops.assign(max_var, batch_max, name='AssignMaxLast')
- ops.add_to_collection(updates_collection, assign_max.op)
+ assign_max_op = state_ops.assign(
+ max_var, batch_max, name='AssignMaxLast').op
+ ops.add_to_collection(updates_collection, assign_max_op)
return _FakeQuantWithMinMaxVars(
inputs,
- assign_min,
- assign_max,
+ batch_min,
+ batch_max,
per_channel=per_channel,
num_bits=num_bits,
narrow_range=narrow_range)
@@ -244,9 +251,9 @@ def MovingAvgQuantize(inputs,
batch_min = math_ops.reduce_min(inputs, name='BatchMin')
# B-eng requires that 0.0 is always in the [min; max] range.
batch_min = math_ops.minimum(batch_min, 0.0)
- assign_min = moving_averages.assign_moving_average(
- min_var, batch_min, ema_decay, name='AssignMinEma')
- ops.add_to_collection(updates_collection, assign_min.op)
+ assign_min_op = moving_averages.assign_moving_average(
+ min_var, batch_min, ema_decay, name='AssignMinEma').op
+ ops.add_to_collection(updates_collection, assign_min_op)
if per_channel:
if input_dim >= 2:
@@ -258,14 +265,14 @@ def MovingAvgQuantize(inputs,
batch_max = math_ops.reduce_max(inputs, name='BatchMax')
# B-eng requires that 0.0 is always in the [min; max] range.
batch_max = math_ops.maximum(batch_max, 0.0)
- assign_max = moving_averages.assign_moving_average(
- max_var, batch_max, ema_decay, name='AssignMaxEma')
- ops.add_to_collection(updates_collection, assign_max.op)
+ assign_max_op = moving_averages.assign_moving_average(
+ max_var, batch_max, ema_decay, name='AssignMaxEma').op
+ ops.add_to_collection(updates_collection, assign_max_op)
return _FakeQuantWithMinMaxVars(
inputs,
- assign_min,
- assign_max,
+ min_var,
+ max_var,
per_channel=per_channel,
num_bits=num_bits,
narrow_range=narrow_range)
@@ -294,10 +301,20 @@ def _FakeQuantWithMinMaxVars(inputs, min_var, max_var, per_channel, num_bits,
if per_channel:
assert len(min_var.get_shape()) == 1
assert len(max_var.get_shape()) == 1
- return array_ops.fake_quant_with_min_max_vars_per_channel(
- inputs, min_var, max_var, num_bits=num_bits, narrow_range=narrow_range)
+ with ops.control_dependencies([check_ops.assert_less(min_var, max_var)]):
+ return array_ops.fake_quant_with_min_max_vars_per_channel(
+ inputs,
+ min_var,
+ max_var,
+ num_bits=num_bits,
+ narrow_range=narrow_range)
else:
assert min_var.get_shape() == [] # pylint: disable=g-explicit-bool-comparison
assert max_var.get_shape() == [] # pylint: disable=g-explicit-bool-comparison
- return array_ops.fake_quant_with_min_max_vars(
- inputs, min_var, max_var, num_bits=num_bits, narrow_range=narrow_range)
+ with ops.control_dependencies([check_ops.assert_less(min_var, max_var)]):
+ return array_ops.fake_quant_with_min_max_vars(
+ inputs,
+ min_var,
+ max_var,
+ num_bits=num_bits,
+ narrow_range=narrow_range)
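The `_FakeQuantWithMinMaxVars` change above wraps the fake-quant ops in a runtime sanity check; the pattern in isolation looks roughly like this (tensor shapes and initial range values are illustrative):

```python
import tensorflow as tf

inputs = tf.placeholder(tf.float32, shape=[4])
min_var = tf.Variable(-6.0)
max_var = tf.Variable(6.0)

# assert_less fails the step at run time if the learned quantization
# range ever collapses (min >= max), instead of quantizing garbage.
with tf.control_dependencies([tf.assert_less(min_var, max_var)]):
  quantized = tf.fake_quant_with_min_max_vars(
      inputs, min_var, max_var, num_bits=8, narrow_range=False)
```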
diff --git a/tensorflow/contrib/quantize/python/quant_ops_test.py b/tensorflow/contrib/quantize/python/quant_ops_test.py
deleted file mode 100644
index 3884679602..0000000000
--- a/tensorflow/contrib/quantize/python/quant_ops_test.py
+++ /dev/null
@@ -1,87 +0,0 @@
-# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-# ==============================================================================
-"""Tests for third_party.tensorflow.contrib.quantize.python.quant_ops."""
-
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-
-from tensorflow.contrib.quantize.python import quant_ops
-from tensorflow.python.client import session
-from tensorflow.python.framework import dtypes
-from tensorflow.python.framework import ops
-from tensorflow.python.ops import array_ops
-from tensorflow.python.ops import variables
-from tensorflow.python.platform import googletest
-
-_MIN_MAX_VARS = 'min_max_vars'
-
-
-class QuantOpsTest(googletest.TestCase):
-
- def testLastValueQuantizeTrainingAssign(self):
- g = ops.Graph()
- with session.Session(graph=g) as sess:
- x = array_ops.placeholder(dtypes.float32, shape=[2])
- y = quant_ops.LastValueQuantize(
- x,
- init_min=0.0,
- init_max=0.0,
- is_training=True,
- vars_collection=_MIN_MAX_VARS)
-
- # Run the step.
- sess.run(variables.global_variables_initializer())
- sess.run(y, feed_dict={x: [-1.0, 1.0]})
- # Now check that the min_max_vars were, in fact, updated.
- min_value, max_value = self._GetMinMaxValues(sess)
- self.assertEqual(min_value, -1.0)
- self.assertEqual(max_value, 1.0)
-
- def testMovingAvgQuantizeTrainingAssign(self):
- g = ops.Graph()
- with session.Session(graph=g) as sess:
- x = array_ops.placeholder(dtypes.float32, shape=[2])
- y = quant_ops.MovingAvgQuantize(
- x,
- init_min=0.0,
- init_max=0.0,
- is_training=True,
- vars_collection=_MIN_MAX_VARS)
-
- # Run the step.
- sess.run(variables.global_variables_initializer())
- # Do two runs to avoid zero debias.
- sess.run(y, feed_dict={x: [-1.0, 1.0]})
- sess.run(y, feed_dict={x: [0.0, 0.0]})
- # Now check that the min_max_vars were, in fact, updated.
- min_value, max_value = self._GetMinMaxValues(sess)
- self.assertGreater(min_value, -1.0)
- self.assertLess(min_value, 0.0)
- self.assertGreater(max_value, 0.0)
- self.assertLess(max_value, 1.0)
-
- def _GetMinMaxValues(self, sess):
- min_max_vars = ops.get_collection(_MIN_MAX_VARS)
- self.assertEqual(len(min_max_vars), 2)
- min_idx = 0 if 'min' in min_max_vars[0].name else 1
- max_idx = (min_idx + 1) % 2
- min_var, max_var = min_max_vars[min_idx], min_max_vars[max_idx]
- min_max_values = sess.run([min_var, max_var])
- return min_max_values[0], min_max_values[1]
-
-
-if __name__ == '__main__':
- googletest.main()
diff --git a/tensorflow/contrib/quantize/python/quantize.py b/tensorflow/contrib/quantize/python/quantize.py
index 7db2d863aa..548e33663e 100644
--- a/tensorflow/contrib/quantize/python/quantize.py
+++ b/tensorflow/contrib/quantize/python/quantize.py
@@ -89,8 +89,8 @@ def Quantize(graph,
op.name[:-len('/depthwise')])
if separable_conv and separable_conv.type == 'Conv2D':
continue
- # Quantize add ops that come after Conv2D or DepthwiseConv2dNative.
- if op.type in ['Conv2D', 'DepthwiseConv2dNative']:
+ if op.type == 'Conv2D':
+ # Quantize add ops that come after Conv2D
add_context_re = re.search(r'^(.*)/[^/]+/', op.name)
if add_context_re is not None:
context.add_contexts.add(add_context_re.group(1))
@@ -387,7 +387,7 @@ class _QuantizeContext(object):
if delay_requested and self.quant_delay and self.quant_delay > 0:
activate_quant = math_ops.greater_equal(
- training_util.get_or_create_global_step(),
+ training_util.get_global_step(),
self.quant_delay,
name=scope + '/activate_quant')
quant = control_flow_ops.cond(
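The `quant_delay` branch above gates quantization on the training step; sketched standalone below. Note that `training_util.get_global_step` returns `None` unless a global step variable already exists, which is why this sketch creates one first:

```python
import tensorflow as tf

global_step = tf.train.get_or_create_global_step()
inputs = tf.placeholder(tf.float32, shape=[4])
quantized = tf.fake_quant_with_min_max_args(inputs, min=-6.0, max=6.0)

# Pass activations through unquantized until quant_delay steps have run.
quant_delay = 1000
activate_quant = tf.greater_equal(global_step, quant_delay)
outputs = tf.cond(activate_quant, lambda: quantized, lambda: inputs)
```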
diff --git a/tensorflow/contrib/quantize/python/quantize_parameterized_test.py b/tensorflow/contrib/quantize/python/quantize_parameterized_test.py
index 57dab03f16..3e62f95bd6 100644
--- a/tensorflow/contrib/quantize/python/quantize_parameterized_test.py
+++ b/tensorflow/contrib/quantize/python/quantize_parameterized_test.py
@@ -97,8 +97,8 @@ class QuantizeTest(test_util.TensorFlowTestCase):
quantization_node_name)
self.assertEqual(weights_quant.type, quantization_node_name)
expected_inputs = [
- scope + '/weights_quant/AssignMinLast',
- scope + '/weights_quant/AssignMaxLast', scope + '/weights/read'
+ scope + '/weights_quant/Minimum', scope + '/weights_quant/Maximum',
+ scope + '/weights/read'
]
self._AssertInputOpsAre(weights_quant, expected_inputs)
output_op_name = scope + '/Conv2D'
@@ -109,8 +109,8 @@ class QuantizeTest(test_util.TensorFlowTestCase):
quantization_node_name)
self.assertEqual(conv_quant.type, quantization_node_name)
expected_inputs = [
- scope + '/conv_quant/AssignMinEma',
- scope + '/conv_quant/AssignMaxEma', scope + '/BiasAdd'
+ scope + '/conv_quant/min/read', scope + '/conv_quant/max/read',
+ scope + '/BiasAdd'
]
self._AssertInputOpsAre(conv_quant, expected_inputs)
output_op_name = (scope + '/conv_quant/delayed_quant/Switch_1'
@@ -122,7 +122,7 @@ class QuantizeTest(test_util.TensorFlowTestCase):
self.assertEqual(act_quant.type, quantization_node_name)
expected_inputs = [
- 'test/act_quant/AssignMinEma', 'test/act_quant/AssignMaxEma',
+ 'test/act_quant/min/read', 'test/act_quant/max/read',
'test/' + activation_op_name
]
self._AssertInputOpsAre(act_quant, expected_inputs)
@@ -172,8 +172,8 @@ class QuantizeTest(test_util.TensorFlowTestCase):
quantization_node_name)
self.assertEqual(weights_quant.type, quantization_node_name)
expected_inputs = [
- scope + '/weights_quant/AssignMinLast',
- scope + '/weights_quant/AssignMaxLast', scope + '/weights/read'
+ scope + '/weights_quant/Minimum', scope + '/weights_quant/Maximum',
+ scope + '/weights/read'
]
self._AssertInputOpsAre(weights_quant, expected_inputs)
output_op_name = scope + '/MatMul'
@@ -184,8 +184,8 @@ class QuantizeTest(test_util.TensorFlowTestCase):
quantization_node_name)
self.assertEqual(conv_quant.type, quantization_node_name)
expected_inputs = [
- scope + '/conv_quant/AssignMinEma',
- scope + '/conv_quant/AssignMaxEma', scope + '/BiasAdd'
+ scope + '/conv_quant/min/read', scope + '/conv_quant/max/read',
+ scope + '/BiasAdd'
]
self._AssertInputOpsAre(conv_quant, expected_inputs)
output_op_name = (scope + '/conv_quant/delayed_quant/Switch_1'
@@ -196,7 +196,7 @@ class QuantizeTest(test_util.TensorFlowTestCase):
quantization_node_name)
self.assertEqual(act_quant.type, quantization_node_name)
expected_inputs = [
- 'test/act_quant/AssignMinEma', 'test/act_quant/AssignMaxEma',
+ 'test/act_quant/min/read', 'test/act_quant/max/read',
'test/' + activation_op_name
]
self._AssertInputOpsAre(act_quant, expected_inputs)
@@ -247,8 +247,7 @@ class QuantizeTest(test_util.TensorFlowTestCase):
quantization_node_name)
self.assertEqual(weights_quant.type, quantization_node_name)
expected_inputs = [
- scope + '/weights_quant/AssignMinLast',
- scope + '/weights_quant/AssignMaxLast',
+ scope + '/weights_quant/Minimum', scope + '/weights_quant/Maximum',
scope + '/depthwise_weights/read'
]
self._AssertInputOpsAre(weights_quant, expected_inputs)
@@ -260,8 +259,8 @@ class QuantizeTest(test_util.TensorFlowTestCase):
quantization_node_name)
self.assertEqual(conv_quant.type, quantization_node_name)
expected_inputs = [
- scope + '/conv_quant/AssignMinEma',
- scope + '/conv_quant/AssignMaxEma', scope + '/BiasAdd'
+ scope + '/conv_quant/min/read', scope + '/conv_quant/max/read',
+ scope + '/BiasAdd'
]
self._AssertInputOpsAre(conv_quant, expected_inputs)
output_op_name = (scope + '/conv_quant/delayed_quant/Switch_1'
@@ -272,7 +271,7 @@ class QuantizeTest(test_util.TensorFlowTestCase):
quantization_node_name)
self.assertEqual(act_quant.type, quantization_node_name)
expected_inputs = [
- 'test/act_quant/AssignMinEma', 'test/act_quant/AssignMaxEma',
+ 'test/act_quant/min/read', 'test/act_quant/max/read',
'test/' + activation_op_name
]
self._AssertInputOpsAre(act_quant, expected_inputs)
@@ -402,10 +401,8 @@ class QuantizeTest(test_util.TensorFlowTestCase):
quantization_node_name)
self.assertEqual(weights_quant.type, quantization_node_name)
expected_inputs = [
- scope + '/weights_quant/' + ('AssignMinEma'
- if use_ema else 'AssignMinLast'),
- scope + '/weights_quant/' + ('AssignMaxEma'
- if use_ema else 'AssignMaxLast'),
+ scope + '/weights_quant/' + ('min/read' if use_ema else 'Minimum'),
+ scope + '/weights_quant/' + ('max/read' if use_ema else 'Maximum'),
scope + '/mul_fold'
]
self._AssertInputOpsAre(weights_quant, expected_inputs)
@@ -418,8 +415,8 @@ class QuantizeTest(test_util.TensorFlowTestCase):
quantization_node_name)
self.assertEqual(conv_quant.type, quantization_node_name)
expected_inputs = [
- scope + '/conv_quant/AssignMinEma',
- scope + '/conv_quant/AssignMaxEma', scope + '/add_fold'
+ scope + '/conv_quant/min/read', scope + '/conv_quant/max/read',
+ scope + '/add_fold'
]
self._AssertInputOpsAre(conv_quant, expected_inputs)
output_op_name = (scope + '/conv_quant/delayed_quant/Switch_1'
@@ -430,7 +427,7 @@ class QuantizeTest(test_util.TensorFlowTestCase):
quantization_node_name)
self.assertEqual(act_quant.type, quantization_node_name)
expected_inputs = [
- 'test/act_quant/AssignMinEma', 'test/act_quant/AssignMaxEma',
+ 'test/act_quant/min/read', 'test/act_quant/max/read',
'test/' + activation_op_name
]
self._AssertInputOpsAre(act_quant, expected_inputs)
@@ -521,10 +518,8 @@ class QuantizeTest(test_util.TensorFlowTestCase):
quantization_node_name)
self.assertEqual(weights_quant.type, quantization_node_name)
expected_inputs = [
- scope + '/weights_quant/' + ('AssignMinEma'
- if use_ema else 'AssignMinLast'),
- scope + '/weights_quant/' + ('AssignMaxEma'
- if use_ema else 'AssignMaxLast'),
+ scope + '/weights_quant/' + ('min/read' if use_ema else 'Minimum'),
+ scope + '/weights_quant/' + ('max/read' if use_ema else 'Maximum'),
scope + '/mul_fold'
]
self._AssertInputOpsAre(weights_quant, expected_inputs)
@@ -537,8 +532,8 @@ class QuantizeTest(test_util.TensorFlowTestCase):
quantization_node_name)
self.assertEqual(conv_quant.type, quantization_node_name)
expected_inputs = [
- scope + '/conv_quant/AssignMinEma',
- scope + '/conv_quant/AssignMaxEma', scope + '/add_fold'
+ scope + '/conv_quant/min/read', scope + '/conv_quant/max/read',
+ scope + '/add_fold'
]
self._AssertInputOpsAre(conv_quant, expected_inputs)
output_op_name = (scope + '/conv_quant/delayed_quant/Switch_1'
@@ -549,7 +544,7 @@ class QuantizeTest(test_util.TensorFlowTestCase):
quantization_node_name)
self.assertEqual(act_quant.type, quantization_node_name)
expected_inputs = [
- 'test/act_quant/AssignMinEma', 'test/act_quant/AssignMaxEma',
+ 'test/act_quant/min/read', 'test/act_quant/max/read',
'test/' + activation_op_name
]
self._AssertInputOpsAre(act_quant, expected_inputs)
@@ -644,10 +639,8 @@ class QuantizeTest(test_util.TensorFlowTestCase):
quantization_node_name)
self.assertEqual(weights_quant.type, quantization_node_name)
expected_inputs = [
- scope + '/weights_quant/' + ('AssignMinEma'
- if use_ema else 'AssignMinLast'),
- scope + '/weights_quant/' + ('AssignMaxEma'
- if use_ema else 'AssignMaxLast'),
+ scope + '/weights_quant/' + ('min/read' if use_ema else 'Minimum'),
+ scope + '/weights_quant/' + ('max/read' if use_ema else 'Maximum'),
scope + '/mul_fold'
]
self._AssertInputOpsAre(weights_quant, expected_inputs)
@@ -660,8 +653,8 @@ class QuantizeTest(test_util.TensorFlowTestCase):
quantization_node_name)
self.assertEqual(conv_quant.type, quantization_node_name)
expected_inputs = [
- scope + '/conv_quant/AssignMinEma',
- scope + '/conv_quant/AssignMaxEma', scope + '/add_fold'
+ scope + '/conv_quant/min/read', scope + '/conv_quant/max/read',
+ scope + '/add_fold'
]
self._AssertInputOpsAre(conv_quant, expected_inputs)
output_op_name = (scope + '/conv_quant/delayed_quant/Switch_1'
@@ -672,7 +665,7 @@ class QuantizeTest(test_util.TensorFlowTestCase):
quantization_node_name)
self.assertEqual(act_quant.type, quantization_node_name)
expected_inputs = [
- 'test/act_quant/AssignMinEma', 'test/act_quant/AssignMaxEma',
+ 'test/act_quant/min/read', 'test/act_quant/max/read',
'test/' + activation_op_name
]
self._AssertInputOpsAre(act_quant, expected_inputs)
diff --git a/tensorflow/contrib/quantize/python/quantize_test.py b/tensorflow/contrib/quantize/python/quantize_test.py
index 1e4dd7cf67..eb141a21bd 100644
--- a/tensorflow/contrib/quantize/python/quantize_test.py
+++ b/tensorflow/contrib/quantize/python/quantize_test.py
@@ -30,7 +30,6 @@ from tensorflow.python.ops import nn_ops
from tensorflow.python.platform import googletest
conv2d = layers.conv2d
-separable_conv2d = layers.separable_conv2d
class QuantizeTest(test_util.TensorFlowTestCase):
@@ -78,30 +77,6 @@ class QuantizeTest(test_util.TensorFlowTestCase):
quantization_node_name)
self.assertEqual(add_quant.type, quantization_node_name)
- def testInsertQuantOpForAddAfterSeparableConv2d(self):
- graph = ops.Graph()
- with graph.as_default():
- batch_size, height, width, depth = 5, 128, 128, 3
- input1 = array_ops.zeros((batch_size, height, width, depth))
- input2 = array_ops.zeros((batch_size, height / 2, width / 2, depth))
- conv = separable_conv2d(input1, None, [5, 5], stride=2,
- depth_multiplier=1.0, padding='SAME',
- weights_initializer=self._WeightInit(0.09),
- activation_fn=None, scope='test/test')
- node = math_ops.add(conv, input2, name='test/add')
- node = array_ops.identity(node, name='test/identity')
- update_barrier = control_flow_ops.no_op(name='update_barrier')
- with ops.control_dependencies([update_barrier]):
- array_ops.identity(node, name='control_dependency')
-
- quantize.Quantize(graph=graph, weight_bits=8, weight_narrow_range=True,
- activation_bits=8)
-
- quantization_node_name = 'FakeQuantWithMinMaxVars'
- add_quant = graph.get_operation_by_name('test/add_quant/' +
- quantization_node_name)
- self.assertEqual(add_quant.type, quantization_node_name)
-
def _WeightInit(self, stddev):
"""Returns truncated normal variable initializer.
diff --git a/tensorflow/contrib/rnn/python/kernel_tests/core_rnn_cell_test.py b/tensorflow/contrib/rnn/python/kernel_tests/core_rnn_cell_test.py
index 909c6aba2b..16b6d145e3 100644
--- a/tensorflow/contrib/rnn/python/kernel_tests/core_rnn_cell_test.py
+++ b/tensorflow/contrib/rnn/python/kernel_tests/core_rnn_cell_test.py
@@ -38,6 +38,9 @@ from tensorflow.python.ops import rnn_cell_impl
from tensorflow.python.ops import variable_scope
from tensorflow.python.ops import variables as variables_lib
from tensorflow.python.platform import test
+from tensorflow.python.framework import test_util
+from tensorflow.contrib.rnn.python.ops import rnn_cell as contrib_rnn_cell
+
# pylint: enable=protected-access
@@ -358,6 +361,45 @@ class RNNCellTest(test.TestCase):
self.assertEquals(variables[2].op.name,
"root/lstm_cell/projection/kernel")
+ def testLSTMCellLayerNorm(self):
+ with self.test_session() as sess:
+ num_units = 2
+ num_proj = 3
+ batch_size = 1
+ input_size = 4
+ with variable_scope.variable_scope(
+ "root", initializer=init_ops.constant_initializer(0.5)):
+ x = array_ops.zeros([batch_size, input_size])
+ c = array_ops.zeros([batch_size, num_units])
+ h = array_ops.zeros([batch_size, num_proj])
+ state = rnn_cell_impl.LSTMStateTuple(c, h)
+ cell = contrib_rnn_cell.LayerNormLSTMCell(
+ num_units=num_units,
+ num_proj=num_proj,
+ forget_bias=1.0,
+ layer_norm=True,
+ norm_gain=1.0,
+ norm_shift=0.0)
+ g, out_m = cell(x, state)
+ sess.run([variables_lib.global_variables_initializer()])
+ res = sess.run([g, out_m], {
+ x.name: np.ones((batch_size, input_size)),
+ c.name: 0.1 * np.ones((batch_size, num_units)),
+ h.name: 0.1 * np.ones((batch_size, num_proj))
+ })
+ self.assertEqual(len(res), 2)
+ # The numbers in the results were not hand-calculated; this is mostly
+ # just a smoke test.
+ self.assertEqual(res[0].shape, (batch_size, num_proj))
+ self.assertEqual(res[1][0].shape, (batch_size, num_units))
+ self.assertEqual(res[1][1].shape, (batch_size, num_proj))
+ # All rows of the batch are identical, so outputs and states should match.
+ for i in range(1, batch_size):
+ self.assertTrue(
+ float(np.linalg.norm((res[0][0, :] - res[0][i, :]))) < 1e-6)
+ self.assertTrue(
+ float(np.linalg.norm((res[1][0, :] - res[1][i, :]))) < 1e-6)
+
def testOutputProjectionWrapper(self):
with self.test_session() as sess:
with variable_scope.variable_scope(
diff --git a/tensorflow/contrib/rnn/python/kernel_tests/rnn_cell_test.py b/tensorflow/contrib/rnn/python/kernel_tests/rnn_cell_test.py
index ebd4564f12..b4a5f2d7eb 100644
--- a/tensorflow/contrib/rnn/python/kernel_tests/rnn_cell_test.py
+++ b/tensorflow/contrib/rnn/python/kernel_tests/rnn_cell_test.py
@@ -37,6 +37,7 @@ from tensorflow.python.ops import math_ops
from tensorflow.python.ops import random_ops
from tensorflow.python.ops import rnn
from tensorflow.python.ops import rnn_cell
+from tensorflow.python.ops import rnn_cell_impl
from tensorflow.python.ops import variable_scope
from tensorflow.python.ops import variables
from tensorflow.python.platform import test
@@ -1275,6 +1276,49 @@ class LayerNormBasicLSTMCellTest(test.TestCase):
self.assertAllClose(res[2].c, expected_c1, 1e-5)
self.assertAllClose(res[2].h, expected_h1, 1e-5)
+
+ def testBasicLSTMCellWithStateTupleLayerNorm(self):
+ """The results of LSTMCell and LayerNormBasicLSTMCell
+ should be same. """
+ with self.test_session() as sess:
+ with variable_scope.variable_scope(
+ "root", initializer=init_ops.constant_initializer(0.5)):
+ x = array_ops.zeros([1, 2])
+ c0 = array_ops.zeros([1, 2])
+ h0 = array_ops.zeros([1, 2])
+ state0 = rnn_cell_impl.LSTMStateTuple(c0, h0)
+ c1 = array_ops.zeros([1, 2])
+ h1 = array_ops.zeros([1, 2])
+ state1 = rnn_cell_impl.LSTMStateTuple(c1, h1)
+ cell = rnn_cell_impl.MultiRNNCell(
+ [contrib_rnn_cell.LayerNormLSTMCell(
+ 2,
+ layer_norm=True,
+ norm_gain=1.0,
+ norm_shift=0.0) for _ in range(2)])
+ h, (s0, s1) = cell(x, (state0, state1))
+ sess.run([variables.global_variables_initializer()])
+ res = sess.run([h, s0, s1], {
+ x.name: np.array([[1., 1.]]),
+ c0.name: 0.1 * np.asarray([[0, 1]]),
+ h0.name: 0.1 * np.asarray([[2, 3]]),
+ c1.name: 0.1 * np.asarray([[4, 5]]),
+ h1.name: 0.1 * np.asarray([[6, 7]]),
+ })
+
+ expected_h = np.array([[-0.38079708, 0.38079708]])
+ expected_h0 = np.array([[-0.38079708, 0.38079708]])
+ expected_c0 = np.array([[-1.0, 1.0]])
+ expected_h1 = np.array([[-0.38079708, 0.38079708]])
+ expected_c1 = np.array([[-1.0, 1.0]])
+
+ self.assertEqual(len(res), 3)
+ self.assertAllClose(res[0], expected_h, 1e-5)
+ self.assertAllClose(res[1].c, expected_c0, 1e-5)
+ self.assertAllClose(res[1].h, expected_h0, 1e-5)
+ self.assertAllClose(res[2].c, expected_c1, 1e-5)
+ self.assertAllClose(res[2].h, expected_h1, 1e-5)
+
def testBasicLSTMCellWithDropout(self):
def _is_close(x, y, digits=4):
diff --git a/tensorflow/contrib/rnn/python/ops/rnn_cell.py b/tensorflow/contrib/rnn/python/ops/rnn_cell.py
index d4691f2c27..5e85c125df 100644
--- a/tensorflow/contrib/rnn/python/ops/rnn_cell.py
+++ b/tensorflow/contrib/rnn/python/ops/rnn_cell.py
@@ -36,6 +36,7 @@ from tensorflow.python.ops import nn_ops
from tensorflow.python.ops import random_ops
from tensorflow.python.ops import rnn_cell_impl
from tensorflow.python.ops import variable_scope as vs
+from tensorflow.python.ops import partitioned_variables
from tensorflow.python.platform import tf_logging as logging
from tensorflow.python.util import nest
@@ -76,6 +77,18 @@ def _get_sharded_variable(name, shape, dtype, num_shards):
return shards
+def _norm(g, b, inp, scope):
+ shape = inp.get_shape()[-1:]
+ gamma_init = init_ops.constant_initializer(g)
+ beta_init = init_ops.constant_initializer(b)
+ with vs.variable_scope(scope):
+ # Initialize beta and gamma for use by layer_norm.
+ vs.get_variable("gamma", shape=shape, initializer=gamma_init)
+ vs.get_variable("beta", shape=shape, initializer=beta_init)
+ normalized = layers.layer_norm(inp, reuse=True, scope=scope)
+ return normalized
+
+
class CoupledInputForgetGateLSTMCell(rnn_cell_impl.RNNCell):
"""Long short-term memory unit (LSTM) recurrent network cell.
@@ -102,13 +115,24 @@ class CoupledInputForgetGateLSTMCell(rnn_cell_impl.RNNCell):
The class uses optional peep-hole connections, and an optional projection
layer.
+
+ Layer normalization implementation is based on:
+
+ https://arxiv.org/abs/1607.06450.
+
+ "Layer Normalization"
+ Jimmy Lei Ba, Jamie Ryan Kiros, Geoffrey E. Hinton
+
+ and is applied before the internal nonlinearities.
+
"""
def __init__(self, num_units, use_peepholes=False,
initializer=None, num_proj=None, proj_clip=None,
num_unit_shards=1, num_proj_shards=1,
forget_bias=1.0, state_is_tuple=True,
- activation=math_ops.tanh, reuse=None):
+ activation=math_ops.tanh, reuse=None,
+ layer_norm=False, norm_gain=1.0, norm_shift=0.0):
"""Initialize the parameters for an LSTM cell.
Args:
@@ -135,6 +159,13 @@ class CoupledInputForgetGateLSTMCell(rnn_cell_impl.RNNCell):
reuse: (optional) Python boolean describing whether to reuse variables
in an existing scope. If not `True`, and the existing scope already has
the given variables, an error is raised.
+ layer_norm: If `True`, layer normalization will be applied.
+ norm_gain: float, The layer normalization gain initial value. If
+ `layer_norm` has been set to `False`, this argument will be ignored.
+ norm_shift: float, The layer normalization shift initial value. If
+ `layer_norm` has been set to `False`, this argument will be ignored.
"""
super(CoupledInputForgetGateLSTMCell, self).__init__(_reuse=reuse)
if not state_is_tuple:
@@ -152,6 +183,9 @@ class CoupledInputForgetGateLSTMCell(rnn_cell_impl.RNNCell):
self._state_is_tuple = state_is_tuple
self._activation = activation
self._reuse = reuse
+ self._layer_norm = layer_norm
+ self._norm_gain = norm_gain
+ self._norm_shift = norm_shift
if num_proj:
self._state_size = (rnn_cell_impl.LSTMStateTuple(num_units, num_proj)
@@ -220,9 +254,20 @@ class CoupledInputForgetGateLSTMCell(rnn_cell_impl.RNNCell):
# j = new_input, f = forget_gate, o = output_gate
cell_inputs = array_ops.concat([inputs, m_prev], 1)
- lstm_matrix = nn_ops.bias_add(math_ops.matmul(cell_inputs, concat_w), b)
+ lstm_matrix = math_ops.matmul(cell_inputs, concat_w)
+
+ # If layer normalization is applied, do not add bias
+ if not self._layer_norm:
+ lstm_matrix = nn_ops.bias_add(lstm_matrix, b)
+
j, f, o = array_ops.split(value=lstm_matrix, num_or_size_splits=3, axis=1)
+ # Apply layer normalization
+ if self._layer_norm:
+ j = _norm(self._norm_gain, self._norm_shift, j, "transform")
+ f = _norm(self._norm_gain, self._norm_shift, f, "forget")
+ o = _norm(self._norm_gain, self._norm_shift, o, "output")
+
# Diagonal connections
if self._use_peepholes:
w_f_diag = vs.get_variable(
@@ -236,6 +281,10 @@ class CoupledInputForgetGateLSTMCell(rnn_cell_impl.RNNCell):
f_act = sigmoid(f + self._forget_bias)
c = (f_act * c_prev + (1 - f_act) * self._activation(j))
+ # Apply layer normalization
+ if self._layer_norm:
+ c = _norm(self._norm_gain, self._norm_shift, c, "state")
+
if self._use_peepholes:
m = sigmoid(o + w_o_diag * c) * self._activation(c)
else:
@@ -1301,8 +1350,8 @@ class LayerNormBasicLSTMCell(rnn_cell_impl.RNNCell):
self._keep_prob = dropout_keep_prob
self._seed = dropout_prob_seed
self._layer_norm = layer_norm
- self._g = norm_gain
- self._b = norm_shift
+ self._norm_gain = norm_gain
+ self._norm_shift = norm_shift
self._reuse = reuse
@property
@@ -1313,24 +1362,25 @@ class LayerNormBasicLSTMCell(rnn_cell_impl.RNNCell):
def output_size(self):
return self._num_units
- def _norm(self, inp, scope):
+ def _norm(self, inp, scope, dtype=dtypes.float32):
shape = inp.get_shape()[-1:]
- gamma_init = init_ops.constant_initializer(self._g)
- beta_init = init_ops.constant_initializer(self._b)
+ gamma_init = init_ops.constant_initializer(self._norm_gain)
+ beta_init = init_ops.constant_initializer(self._norm_shift)
with vs.variable_scope(scope):
# Initialize beta and gamma for use by layer_norm.
- vs.get_variable("gamma", shape=shape, initializer=gamma_init)
- vs.get_variable("beta", shape=shape, initializer=beta_init)
+ vs.get_variable("gamma", shape=shape, initializer=gamma_init, dtype=dtype)
+ vs.get_variable("beta", shape=shape, initializer=beta_init, dtype=dtype)
normalized = layers.layer_norm(inp, reuse=True, scope=scope)
return normalized
def _linear(self, args):
out_size = 4 * self._num_units
proj_size = args.get_shape()[-1]
- weights = vs.get_variable("kernel", [proj_size, out_size])
+ dtype = args.dtype
+ weights = vs.get_variable("kernel", [proj_size, out_size], dtype=dtype)
out = math_ops.matmul(args, weights)
if not self._layer_norm:
- bias = vs.get_variable("bias", [out_size])
+ bias = vs.get_variable("bias", [out_size], dtype=dtype)
out = nn_ops.bias_add(out, bias)
return out
@@ -1339,13 +1389,14 @@ class LayerNormBasicLSTMCell(rnn_cell_impl.RNNCell):
c, h = state
args = array_ops.concat([inputs, h], 1)
concat = self._linear(args)
+ dtype = args.dtype
i, j, f, o = array_ops.split(value=concat, num_or_size_splits=4, axis=1)
if self._layer_norm:
- i = self._norm(i, "input")
- j = self._norm(j, "transform")
- f = self._norm(f, "forget")
- o = self._norm(o, "output")
+ i = self._norm(i, "input", dtype=dtype)
+ j = self._norm(j, "transform", dtype=dtype)
+ f = self._norm(f, "forget", dtype=dtype)
+ o = self._norm(o, "output", dtype=dtype)
g = self._activation(j)
if (not isinstance(self._keep_prob, float)) or self._keep_prob < 1:
@@ -1354,7 +1405,7 @@ class LayerNormBasicLSTMCell(rnn_cell_impl.RNNCell):
new_c = (c * math_ops.sigmoid(f + self._forget_bias)
+ math_ops.sigmoid(i) * g)
if self._layer_norm:
- new_c = self._norm(new_c, "state")
+ new_c = self._norm(new_c, "state", dtype=dtype)
new_h = self._activation(new_c) * math_ops.sigmoid(o)
new_state = rnn_cell_impl.LSTMStateTuple(new_c, new_h)
@@ -2306,3 +2357,264 @@ class GLSTMCell(rnn_cell_impl.RNNCell):
new_state = rnn_cell_impl.LSTMStateTuple(c, m)
return m, new_state
+
+
+class LayerNormLSTMCell(rnn_cell_impl.RNNCell):
+ """Long short-term memory unit (LSTM) recurrent network cell.
+
+ The default non-peephole implementation is based on:
+
+ http://www.bioinf.jku.at/publications/older/2604.pdf
+
+ S. Hochreiter and J. Schmidhuber.
+ "Long Short-Term Memory". Neural Computation, 9(8):1735-1780, 1997.
+
+ The peephole implementation is based on:
+
+ https://research.google.com/pubs/archive/43905.pdf
+
+ Hasim Sak, Andrew Senior, and Francoise Beaufays.
+ "Long short-term memory recurrent neural network architectures for
+ large scale acoustic modeling." INTERSPEECH, 2014.
+
+ The class uses optional peep-hole connections, optional cell clipping, and
+ an optional projection layer.
+
+ Layer normalization implementation is based on:
+
+ https://arxiv.org/abs/1607.06450.
+
+ "Layer Normalization"
+ Jimmy Lei Ba, Jamie Ryan Kiros, Geoffrey E. Hinton
+
+ and is applied before the internal nonlinearities.
+
+ """
+
+ def __init__(self, num_units,
+ use_peepholes=False, cell_clip=None,
+ initializer=None, num_proj=None, proj_clip=None,
+ forget_bias=1.0,
+ activation=None, layer_norm=False,
+ norm_gain=1.0, norm_shift=0.0, reuse=None):
+ """Initialize the parameters for an LSTM cell.
+
+ Args:
+ num_units: int, The number of units in the LSTM cell
+ use_peepholes: bool, set True to enable diagonal/peephole connections.
+ cell_clip: (optional) A float value; if provided, the cell state is clipped
+ by this value prior to the cell output activation.
+ initializer: (optional) The initializer to use for the weight and
+ projection matrices.
+ num_proj: (optional) int, The output dimensionality for the projection
+ matrices. If None, no projection is performed.
+ proj_clip: (optional) A float value. If `num_proj > 0` and `proj_clip` is
+ provided, then the projected values are clipped elementwise to within
+ `[-proj_clip, proj_clip]`.
+ forget_bias: Biases of the forget gate are initialized by default to 1
+ in order to reduce the scale of forgetting at the beginning of
+ the training. Must set it manually to `0.0` when restoring from
+ CudnnLSTM trained checkpoints.
+ activation: Activation function of the inner states. Default: `tanh`.
+ layer_norm: If `True`, layer normalization will be applied.
+ norm_gain: float, The layer normalization gain initial value. If
+ `layer_norm` has been set to `False`, this argument will be ignored.
+ norm_shift: float, The layer normalization shift initial value. If
+ `layer_norm` has been set to `False`, this argument will be ignored.
+ reuse: (optional) Python boolean describing whether to reuse variables
+ in an existing scope. If not `True`, and the existing scope already has
+ the given variables, an error is raised.
+
+ When restoring from CudnnLSTM-trained checkpoints, you must use
+ CudnnCompatibleLSTMCell instead.
+ """
+ super(LayerNormLSTMCell, self).__init__(_reuse=reuse)
+
+ self._num_units = num_units
+ self._use_peepholes = use_peepholes
+ self._cell_clip = cell_clip
+ self._initializer = initializer
+ self._num_proj = num_proj
+ self._proj_clip = proj_clip
+ self._forget_bias = forget_bias
+ self._activation = activation or math_ops.tanh
+ self._layer_norm = layer_norm
+ self._norm_gain = norm_gain
+ self._norm_shift = norm_shift
+
+ if num_proj:
+ self._state_size = (rnn_cell_impl.LSTMStateTuple(num_units, num_proj))
+ self._output_size = num_proj
+ else:
+ self._state_size = (rnn_cell_impl.LSTMStateTuple(num_units, num_units))
+ self._output_size = num_units
+
+ @property
+ def state_size(self):
+ return self._state_size
+
+ @property
+ def output_size(self):
+ return self._output_size
+
+
+ def _linear(self,
+ args,
+ output_size,
+ bias,
+ bias_initializer=None,
+ kernel_initializer=None,
+ layer_norm=False):
+ """Linear map: sum_i(args[i] * W[i]), where W[i] is a Variable.
+
+ Args:
+ args: a 2D Tensor or a list of 2D, batch x n, Tensors.
+ output_size: int, second dimension of W[i].
+ bias: boolean, whether to add a bias term or not.
+ bias_initializer: starting value to initialize the bias
+ (default is all zeros).
+ kernel_initializer: starting value to initialize the weight.
+ layer_norm: boolean, whether to apply layer normalization.
+
+
+ Returns:
+ A 2D Tensor with shape [batch x output_size] taking value
+ sum_i(args[i] * W[i]), where each W[i] is a newly created Variable.
+
+ Raises:
+ ValueError: if some of the arguments have an unspecified or wrong shape.
+ """
+ if args is None or (nest.is_sequence(args) and not args):
+ raise ValueError("`args` must be specified")
+ if not nest.is_sequence(args):
+ args = [args]
+
+ # Calculate the total size of arguments on dimension 1.
+ total_arg_size = 0
+ shapes = [a.get_shape() for a in args]
+ for shape in shapes:
+ if shape.ndims != 2:
+ raise ValueError("linear is expecting 2D arguments: %s" % shapes)
+ if shape[1].value is None:
+ raise ValueError("linear expects shape[1] to be provided for shape %s, "
+ "but saw %s" % (shape, shape[1]))
+ else:
+ total_arg_size += shape[1].value
+
+ dtype = [a.dtype for a in args][0]
+
+ # Now the computation.
+ scope = vs.get_variable_scope()
+ with vs.variable_scope(scope) as outer_scope:
+ weights = vs.get_variable(
+ "kernel", [total_arg_size, output_size],
+ dtype=dtype,
+ initializer=kernel_initializer)
+ if len(args) == 1:
+ res = math_ops.matmul(args[0], weights)
+ else:
+ res = math_ops.matmul(array_ops.concat(args, 1), weights)
+ if not bias:
+ return res
+ with vs.variable_scope(outer_scope) as inner_scope:
+ inner_scope.set_partitioner(None)
+ if bias_initializer is None:
+ bias_initializer = init_ops.constant_initializer(0.0, dtype=dtype)
+ biases = vs.get_variable(
+ "bias", [output_size],
+ dtype=dtype,
+ initializer=bias_initializer)
+
+ if not layer_norm:
+ res = nn_ops.bias_add(res, biases)
+
+ return res
+
+ def call(self, inputs, state):
+ """Run one step of LSTM.
+
+ Args:
+ inputs: input Tensor, 2D, batch x num_units.
+ state: this must be a tuple of state Tensors,
+ both `2-D`, with column sizes `c_state` and
+ `m_state`.
+
+ Returns:
+ A tuple containing:
+
+ - A `2-D, [batch x output_dim]` Tensor representing the output of the
+ LSTM after reading `inputs` when previous state was `state`.
+ Here output_dim is:
+ num_proj if num_proj was set,
+ num_units otherwise.
+ - Tensor(s) representing the new state of LSTM after reading `inputs` when
+ the previous state was `state`. Same type and shape(s) as `state`.
+
+ Raises:
+ ValueError: If input size cannot be inferred from inputs via
+ static shape inference.
+ """
+ num_proj = self._num_units if self._num_proj is None else self._num_proj
+ sigmoid = math_ops.sigmoid
+
+ (c_prev, m_prev) = state
+
+ dtype = inputs.dtype
+ input_size = inputs.get_shape().with_rank(2)[1]
+ if input_size.value is None:
+ raise ValueError("Could not infer input size from inputs.get_shape()[-1]")
+ scope = vs.get_variable_scope()
+ with vs.variable_scope(scope, initializer=self._initializer) as unit_scope:
+
+ # i = input_gate, j = new_input, f = forget_gate, o = output_gate
+ lstm_matrix = self._linear([inputs, m_prev], 4 * self._num_units, bias=True,
+ bias_initializer=None, layer_norm=self._layer_norm)
+ i, j, f, o = array_ops.split(
+ value=lstm_matrix, num_or_size_splits=4, axis=1)
+
+ if self._layer_norm:
+ i = _norm(self._norm_gain, self._norm_shift, i, "input")
+ j = _norm(self._norm_gain, self._norm_shift, j, "transform")
+ f = _norm(self._norm_gain, self._norm_shift, f, "forget")
+ o = _norm(self._norm_gain, self._norm_shift, o, "output")
+
+ # Diagonal connections
+ if self._use_peepholes:
+ with vs.variable_scope(unit_scope):
+ w_f_diag = vs.get_variable(
+ "w_f_diag", shape=[self._num_units], dtype=dtype)
+ w_i_diag = vs.get_variable(
+ "w_i_diag", shape=[self._num_units], dtype=dtype)
+ w_o_diag = vs.get_variable(
+ "w_o_diag", shape=[self._num_units], dtype=dtype)
+
+ if self._use_peepholes:
+ c = (sigmoid(f + self._forget_bias + w_f_diag * c_prev) * c_prev +
+ sigmoid(i + w_i_diag * c_prev) * self._activation(j))
+ else:
+ c = (sigmoid(f + self._forget_bias) * c_prev + sigmoid(i) *
+ self._activation(j))
+
+ if self._layer_norm:
+ c = _norm(self._norm_gain, self._norm_shift, c, "state")
+
+ if self._cell_clip is not None:
+ # pylint: disable=invalid-unary-operand-type
+ c = clip_ops.clip_by_value(c, -self._cell_clip, self._cell_clip)
+ # pylint: enable=invalid-unary-operand-type
+ if self._use_peepholes:
+ m = sigmoid(o + w_o_diag * c) * self._activation(c)
+ else:
+ m = sigmoid(o) * self._activation(c)
+
+ if self._num_proj is not None:
+ with vs.variable_scope("projection") as proj_scope:
+ m = self._linear(m, self._num_proj, bias=False)
+
+ if self._proj_clip is not None:
+ # pylint: disable=invalid-unary-operand-type
+ m = clip_ops.clip_by_value(m, -self._proj_clip, self._proj_clip)
+ # pylint: enable=invalid-unary-operand-type
+
+ new_state = (rnn_cell_impl.LSTMStateTuple(c, m))
+ return m, new_state
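The module-level `_norm` helper added above relies on a variable-reuse trick: it first creates `gamma` and `beta` with the requested initial values, then calls `layers.layer_norm` with `reuse=True` so the layer picks up those variables instead of creating its own defaults. A standalone sketch of the same pattern (the scope name is illustrative):

```python
import tensorflow as tf
from tensorflow.contrib import layers

def norm(inp, scope, gain=1.0, shift=0.0):
  shape = inp.get_shape()[-1:]
  with tf.variable_scope(scope):
    # Pre-create gamma/beta with custom initial values...
    tf.get_variable("gamma", shape=shape,
                    initializer=tf.constant_initializer(gain))
    tf.get_variable("beta", shape=shape,
                    initializer=tf.constant_initializer(shift))
  # ...and have layer_norm reuse them rather than create its own.
  return layers.layer_norm(inp, reuse=True, scope=scope)

x = tf.placeholder(tf.float32, shape=[2, 4])
y = norm(x, "transform", gain=1.0, shift=0.0)
```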
diff --git a/tensorflow/contrib/seq2seq/python/kernel_tests/attention_wrapper_test.py b/tensorflow/contrib/seq2seq/python/kernel_tests/attention_wrapper_test.py
index 01a5540121..91493302b1 100644
--- a/tensorflow/contrib/seq2seq/python/kernel_tests/attention_wrapper_test.py
+++ b/tensorflow/contrib/seq2seq/python/kernel_tests/attention_wrapper_test.py
@@ -33,7 +33,6 @@ from tensorflow.python.framework import dtypes
from tensorflow.python.ops import array_ops
from tensorflow.python.ops import init_ops
from tensorflow.python.ops import math_ops
-from tensorflow.python.ops import random_ops
from tensorflow.python.ops import rnn_cell
from tensorflow.python.ops import variables
from tensorflow.python.ops import variable_scope as vs
@@ -590,24 +589,6 @@ class AttentionWrapperTest(test.TestCase):
expected_final_alignment_history=expected_final_alignment_history,
name='testBahdanauMonotonicNormalized')
- def testBahdanauMonotonicHard(self):
- # Run attention mechanism with mode='hard', make sure probabilities are hard
- b, t, u, d = 10, 20, 30, 40
- with self.test_session(use_gpu=True) as sess:
- a = wrapper.BahdanauMonotonicAttention(
- d,
- random_ops.random_normal((b, t, u)),
- mode='hard')
- # Just feed previous attention as [1, 0, 0, ...]
- attn = a(random_ops.random_normal((b, d)), array_ops.one_hot([0]*b, t))
- sess.run(variables.global_variables_initializer())
- attn_out = attn.eval()
- # All values should be 0 or 1
- self.assertTrue(np.all(np.logical_or(attn_out == 0, attn_out == 1)))
- # Sum of distributions should be 0 or 1 (0 when all p_choose_i are 0)
- self.assertTrue(np.all(np.logical_or(attn_out.sum(axis=1) == 1,
- attn_out.sum(axis=1) == 0)))
-
def testLuongMonotonicNotNormalized(self):
create_attention_mechanism = functools.partial(
wrapper.LuongMonotonicAttention, sigmoid_noise=1.0,
@@ -714,24 +695,6 @@ class AttentionWrapperTest(test.TestCase):
expected_final_alignment_history=expected_final_alignment_history,
name='testMultiAttention')
- def testLuongMonotonicHard(self):
- # Run attention mechanism with mode='hard', make sure probabilities are hard
- b, t, u, d = 10, 20, 30, 40
- with self.test_session(use_gpu=True) as sess:
- a = wrapper.LuongMonotonicAttention(
- d,
- random_ops.random_normal((b, t, u)),
- mode='hard')
- # Just feed previous attention as [1, 0, 0, ...]
- attn = a(random_ops.random_normal((b, d)), array_ops.one_hot([0]*b, t))
- sess.run(variables.global_variables_initializer())
- attn_out = attn.eval()
- # All values should be 0 or 1
- self.assertTrue(np.all(np.logical_or(attn_out == 0, attn_out == 1)))
- # Sum of distributions should be 0 or 1 (0 when all p_choose_i are 0)
- self.assertTrue(np.all(np.logical_or(attn_out.sum(axis=1) == 1,
- attn_out.sum(axis=1) == 0)))
-
def testMultiAttentionNoAttentionLayer(self):
create_attention_mechanisms = (
wrapper.BahdanauAttention, wrapper.LuongAttention)
diff --git a/tensorflow/contrib/seq2seq/python/ops/attention_wrapper.py b/tensorflow/contrib/seq2seq/python/ops/attention_wrapper.py
index 87230e3355..0c64c9caf1 100644
--- a/tensorflow/contrib/seq2seq/python/ops/attention_wrapper.py
+++ b/tensorflow/contrib/seq2seq/python/ops/attention_wrapper.py
@@ -149,7 +149,7 @@ class _BaseAttentionMechanism(AttentionMechanism):
memory_sequence_length=None,
memory_layer=None,
check_inner_dims_defined=True,
- score_mask_value=float("-inf"),
+ score_mask_value=None,
name=None):
"""Construct base AttentionMechanism class.
@@ -187,9 +187,12 @@ class _BaseAttentionMechanism(AttentionMechanism):
"memory_layer is not a Layer: %s" % type(memory_layer).__name__)
self._query_layer = query_layer
self._memory_layer = memory_layer
+ self.dtype = memory_layer.dtype
if not callable(probability_fn):
raise TypeError("probability_fn must be callable, saw type: %s" %
type(probability_fn).__name__)
+ if score_mask_value is None:
+ score_mask_value = dtypes.as_dtype(
+ self._memory_layer.dtype).as_numpy_dtype(-np.inf)
self._probability_fn = lambda score, prev: ( # pylint:disable=g-long-lambda
probability_fn(
_maybe_mask_score(score, memory_sequence_length, score_mask_value),
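The new `score_mask_value` default above is computed in the memory layer's dtype rather than as a plain Python float, so the mask constant matches the score tensor's dtype (mixing a float32 `-inf` into, say, a float16 graph would otherwise fail dtype checks). The conversion in isolation:

```python
import numpy as np
from tensorflow.python.framework import dtypes

# A dtype-specific -inf, mirroring the default computed above.
mask_value = dtypes.as_dtype(np.float16).as_numpy_dtype(-np.inf)
print(mask_value, type(mask_value))  # -inf, <class 'numpy.float16'>
```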
@@ -334,7 +337,8 @@ class LuongAttention(_BaseAttentionMechanism):
memory_sequence_length=None,
scale=False,
probability_fn=None,
- score_mask_value=float("-inf"),
+ score_mask_value=None,
+ dtype=None,
name="LuongAttention"):
"""Construct the AttentionMechanism mechanism.
@@ -353,17 +357,20 @@ class LuongAttention(_BaseAttentionMechanism):
score_mask_value: (optional) The mask value for score before passing into
`probability_fn`. The default is -inf. Only used if
`memory_sequence_length` is not None.
+ dtype: The data type for the memory layer of the attention mechanism.
name: Name to use when creating ops.
"""
# For LuongAttention, we only transform the memory layer; thus
# num_units **must** match the expected query depth.
if probability_fn is None:
probability_fn = nn_ops.softmax
+ if dtype is None:
+ dtype = dtypes.float32
wrapped_probability_fn = lambda score, _: probability_fn(score)
super(LuongAttention, self).__init__(
query_layer=None,
memory_layer=layers_core.Dense(
- num_units, name="memory_layer", use_bias=False),
+ num_units, name="memory_layer", use_bias=False, dtype=dtype),
memory=memory,
probability_fn=wrapped_probability_fn,
memory_sequence_length=memory_sequence_length,
@@ -475,7 +482,8 @@ class BahdanauAttention(_BaseAttentionMechanism):
memory_sequence_length=None,
normalize=False,
probability_fn=None,
- score_mask_value=float("-inf"),
+ score_mask_value=None,
+ dtype=None,
name="BahdanauAttention"):
"""Construct the Attention mechanism.
@@ -494,16 +502,20 @@ class BahdanauAttention(_BaseAttentionMechanism):
score_mask_value: (optional) The mask value for score before passing into
`probability_fn`. The default is -inf. Only used if
`memory_sequence_length` is not None.
+ dtype: The data type for the query and memory layers of the attention
+ mechanism.
name: Name to use when creating ops.
"""
if probability_fn is None:
probability_fn = nn_ops.softmax
+ if dtype is None:
+ dtype = dtypes.float32
wrapped_probability_fn = lambda score, _: probability_fn(score)
super(BahdanauAttention, self).__init__(
query_layer=layers_core.Dense(
- num_units, name="query_layer", use_bias=False),
+ num_units, name="query_layer", use_bias=False, dtype=dtype),
memory_layer=layers_core.Dense(
- num_units, name="memory_layer", use_bias=False),
+ num_units, name="memory_layer", use_bias=False, dtype=dtype),
memory=memory,
probability_fn=wrapped_probability_fn,
memory_sequence_length=memory_sequence_length,
@@ -679,11 +691,7 @@ def _monotonic_probability_fn(score, previous_alignments, sigmoid_noise, mode,
seed=seed)
score += sigmoid_noise*noise
# Compute "choosing" probabilities from the attention scores
- if mode == "hard":
- # When mode is hard, use a hard sigmoid
- p_choose_i = math_ops.cast(score > 0, score.dtype)
- else:
- p_choose_i = math_ops.sigmoid(score)
+ p_choose_i = math_ops.sigmoid(score)
# Convert from choosing probabilities to attention distribution
return monotonic_attention(p_choose_i, previous_alignments, mode)
@@ -738,11 +746,12 @@ class BahdanauMonotonicAttention(_BaseMonotonicAttentionMechanism):
memory,
memory_sequence_length=None,
normalize=False,
- score_mask_value=float("-inf"),
+ score_mask_value=None,
sigmoid_noise=0.,
sigmoid_noise_seed=None,
score_bias_init=0.,
mode="parallel",
+ dtype=None,
name="BahdanauMonotonicAttention"):
"""Construct the Attention mechanism.
@@ -766,17 +775,21 @@ class BahdanauMonotonicAttention(_BaseMonotonicAttentionMechanism):
mode: How to compute the attention distribution. Must be one of
'recursive', 'parallel', or 'hard'. See the docstring for
`tf.contrib.seq2seq.monotonic_attention` for more information.
+ dtype: The data type for the query and memory layers of the attention
+ mechanism.
name: Name to use when creating ops.
"""
# Set up the monotonic probability fn with supplied parameters
+ if dtype is None:
+ dtype = dtypes.float32
wrapped_probability_fn = functools.partial(
_monotonic_probability_fn, sigmoid_noise=sigmoid_noise, mode=mode,
seed=sigmoid_noise_seed)
super(BahdanauMonotonicAttention, self).__init__(
query_layer=layers_core.Dense(
- num_units, name="query_layer", use_bias=False),
+ num_units, name="query_layer", use_bias=False, dtype=dtype),
memory_layer=layers_core.Dense(
- num_units, name="memory_layer", use_bias=False),
+ num_units, name="memory_layer", use_bias=False, dtype=dtype),
memory=memory,
probability_fn=wrapped_probability_fn,
memory_sequence_length=memory_sequence_length,
@@ -834,11 +847,12 @@ class LuongMonotonicAttention(_BaseMonotonicAttentionMechanism):
memory,
memory_sequence_length=None,
scale=False,
- score_mask_value=float("-inf"),
+ score_mask_value=None,
sigmoid_noise=0.,
sigmoid_noise_seed=None,
score_bias_init=0.,
mode="parallel",
+ dtype=None,
name="LuongMonotonicAttention"):
"""Construct the Attention mechanism.
@@ -862,17 +876,21 @@ class LuongMonotonicAttention(_BaseMonotonicAttentionMechanism):
mode: How to compute the attention distribution. Must be one of
'recursive', 'parallel', or 'hard'. See the docstring for
`tf.contrib.seq2seq.monotonic_attention` for more information.
+ dtype: The data type for the query and memory layers of the attention
+ mechanism.
name: Name to use when creating ops.
"""
# Set up the monotonic probability fn with supplied parameters
+ if dtype is None:
+ dtype = dtypes.float32
wrapped_probability_fn = functools.partial(
_monotonic_probability_fn, sigmoid_noise=sigmoid_noise, mode=mode,
seed=sigmoid_noise_seed)
super(LuongMonotonicAttention, self).__init__(
query_layer=layers_core.Dense(
- num_units, name="query_layer", use_bias=False),
+ num_units, name="query_layer", use_bias=False, dtype=dtype),
memory_layer=layers_core.Dense(
- num_units, name="memory_layer", use_bias=False),
+ num_units, name="memory_layer", use_bias=False, dtype=dtype),
memory=memory,
probability_fn=wrapped_probability_fn,
memory_sequence_length=memory_sequence_length,
@@ -1123,8 +1141,9 @@ class AttentionWrapper(rnn_cell_impl.RNNCell):
% (len(attention_layer_sizes), len(attention_mechanisms)))
self._attention_layers = tuple(
layers_core.Dense(
- attention_layer_size, name="attention_layer", use_bias=False)
- for attention_layer_size in attention_layer_sizes)
+ attention_layer_size, name="attention_layer", use_bias=False,
+ dtype=attention_mechanisms[i].dtype)
+ for i, attention_layer_size in enumerate(attention_layer_sizes))
self._attention_layer_size = sum(attention_layer_sizes)
else:
self._attention_layers = None
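The hunks above thread a `dtype` argument through the attention mechanisms' query and memory layers and derive the default `score_mask_value` from the memory layer's dtype. A minimal sketch of how a caller might use it, assuming TF 1.x graph mode and made-up shapes:

```python
import tensorflow as tf
from tensorflow.contrib.seq2seq import BahdanauAttention

# Memory of shape [batch, time, depth] in float64; the query/memory layers
# and the -inf score mask now follow the same dtype.
memory = tf.placeholder(tf.float64, shape=[None, 20, 30])
attention_mechanism = BahdanauAttention(
    num_units=64, memory=memory, dtype=tf.float64)
```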
diff --git a/tensorflow/contrib/slim/BUILD b/tensorflow/contrib/slim/BUILD
index c2f106c2b2..23c23af2f4 100644
--- a/tensorflow/contrib/slim/BUILD
+++ b/tensorflow/contrib/slim/BUILD
@@ -39,8 +39,6 @@ py_test(
"//tensorflow/python:summary",
"//tensorflow/python:training",
"//tensorflow/python:variables",
- "//tensorflow/python/debug:debug_data",
- "//tensorflow/python/debug:hooks",
"//third_party/py/numpy",
],
)
diff --git a/tensorflow/contrib/slim/README.md b/tensorflow/contrib/slim/README.md
index 0bfd0801d5..f7a85557ca 100644
--- a/tensorflow/contrib/slim/README.md
+++ b/tensorflow/contrib/slim/README.md
@@ -237,7 +237,7 @@ One way to reduce this code duplication would be via a `for` loop:
```python
net = ...
for i in range(3):
- net = slim.conv2d(net, 256, [3, 3], scope='conv3_' % (i+1))
+ net = slim.conv2d(net, 256, [3, 3], scope='conv3_%d' % (i+1))
net = slim.max_pool2d(net, [2, 2], scope='pool2')
```
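As a side note (not part of this patch), TF-Slim's `repeat` helper expresses the same pattern even more compactly, under the same assumption that `net` and `slim` are already defined:

```python
# Applies slim.conv2d three times, numbering the variable scopes automatically.
net = slim.repeat(net, 3, slim.conv2d, 256, [3, 3], scope='conv3')
net = slim.max_pool2d(net, [2, 2], scope='pool2')
```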
diff --git a/tensorflow/contrib/slim/python/slim/evaluation.py b/tensorflow/contrib/slim/python/slim/evaluation.py
index cdb720b36b..2d4b08df61 100644
--- a/tensorflow/contrib/slim/python/slim/evaluation.py
+++ b/tensorflow/contrib/slim/python/slim/evaluation.py
@@ -153,8 +153,7 @@ def evaluate_once(master,
summary_op=_USE_DEFAULT,
summary_op_feed_dict=None,
variables_to_restore=None,
- session_config=None,
- hooks=None):
+ session_config=None):
"""Evaluates the model at the given checkpoint path.
Args:
@@ -178,8 +177,6 @@ def evaluate_once(master,
slim.variables.GetVariablesToRestore() is used.
session_config: An instance of `tf.ConfigProto` that will be used to
configure the `Session`. If left as `None`, the default will be used.
- hooks: A list of additional `SessionRunHook` objects to pass during the
- evaluation.
Returns:
The value of `final_op` or `None` if `final_op` is `None`.
@@ -187,13 +184,11 @@ def evaluate_once(master,
if summary_op == _USE_DEFAULT:
summary_op = summary.merge_all()
- all_hooks = [evaluation.StopAfterNEvalsHook(num_evals),]
+ hooks = [evaluation.StopAfterNEvalsHook(num_evals),]
if summary_op is not None:
- all_hooks.append(evaluation.SummaryAtEndHook(
+ hooks.append(evaluation.SummaryAtEndHook(
log_dir=logdir, summary_op=summary_op, feed_dict=summary_op_feed_dict))
- if hooks is not None:
- all_hooks.extend(hooks)
saver = None
if variables_to_restore is not None:
@@ -208,7 +203,7 @@ def evaluate_once(master,
feed_dict=eval_op_feed_dict,
final_ops=final_op,
final_ops_feed_dict=final_op_feed_dict,
- hooks=all_hooks,
+ hooks=hooks,
config=session_config)
@@ -261,7 +256,7 @@ def evaluation_loop(master,
configure the `Session`. If left as `None`, the default will be used.
timeout: The maximum amount of time to wait between checkpoints. If left as
`None`, then the process will wait indefinitely.
- hooks: A list of additional `SessionRunHook` objects to pass during
+ hooks: A list of additional SessionRunHook objects to pass during
repeated evaluations.
Returns:
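After this revert, `evaluate_once` no longer accepts a `hooks` argument. A minimal calling sketch, assuming `checkpoint_path`, `log_dir`, and the metric ops (`update_op`, `value_op`) already exist:

```python
# Runs one evaluation pass over the checkpoint and returns final_op's value.
accuracy = evaluation.evaluate_once(
    master='',
    checkpoint_path=checkpoint_path,
    logdir=log_dir,
    num_evals=1,
    eval_op=update_op,
    final_op=value_op)
```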
diff --git a/tensorflow/contrib/slim/python/slim/evaluation_test.py b/tensorflow/contrib/slim/python/slim/evaluation_test.py
index 870f504d10..d9e0f54b72 100644
--- a/tensorflow/contrib/slim/python/slim/evaluation_test.py
+++ b/tensorflow/contrib/slim/python/slim/evaluation_test.py
@@ -20,7 +20,6 @@ from __future__ import print_function
import glob
import os
-import shutil
import time
import numpy as np
@@ -30,8 +29,6 @@ from tensorflow.contrib.metrics.python.ops import metric_ops
from tensorflow.contrib.slim.python.slim import evaluation
from tensorflow.contrib.training.python.training import evaluation as evaluation_lib
from tensorflow.core.protobuf import saver_pb2
-from tensorflow.python.debug.lib import debug_data
-from tensorflow.python.debug.wrappers import hooks
from tensorflow.python.framework import constant_op
from tensorflow.python.framework import dtypes
from tensorflow.python.framework import errors
@@ -233,7 +230,11 @@ class SingleEvaluationTest(test.TestCase):
with self.assertRaises(errors.NotFoundError):
evaluation.evaluate_once('', checkpoint_path, log_dir)
- def _prepareCheckpoint(self, checkpoint_path):
+ def testRestoredModelPerformance(self):
+ checkpoint_path = os.path.join(self.get_temp_dir(), 'model.ckpt')
+ log_dir = os.path.join(self.get_temp_dir(), 'log_dir1/')
+
+ # First, save out the current model to a checkpoint:
init_op = control_flow_ops.group(variables.global_variables_initializer(),
variables.local_variables_initializer())
saver = saver_lib.Saver(write_version=saver_pb2.SaverDef.V1)
@@ -241,13 +242,6 @@ class SingleEvaluationTest(test.TestCase):
sess.run(init_op)
saver.save(sess, checkpoint_path)
- def testRestoredModelPerformance(self):
- checkpoint_path = os.path.join(self.get_temp_dir(), 'model.ckpt')
- log_dir = os.path.join(self.get_temp_dir(), 'log_dir1/')
-
- # First, save out the current model to a checkpoint:
- self._prepareCheckpoint(checkpoint_path)
-
# Next, determine the metric to evaluate:
value_op, update_op = metric_ops.streaming_accuracy(self._predictions,
self._labels)
@@ -257,36 +251,6 @@ class SingleEvaluationTest(test.TestCase):
'', checkpoint_path, log_dir, eval_op=update_op, final_op=value_op)
self.assertAlmostEqual(accuracy_value, self._expected_accuracy)
- def testAdditionalHooks(self):
- checkpoint_path = os.path.join(self.get_temp_dir(), 'model.ckpt')
- log_dir = os.path.join(self.get_temp_dir(), 'log_dir1/')
-
- # First, save out the current model to a checkpoint:
- self._prepareCheckpoint(checkpoint_path)
-
- # Next, determine the metric to evaluate:
- value_op, update_op = metric_ops.streaming_accuracy(self._predictions,
- self._labels)
-
- dumping_root = os.path.join(self.get_temp_dir(), 'tfdbg_dump_dir')
- dumping_hook = hooks.DumpingDebugHook(dumping_root, log_usage=False)
- try:
- # Run the evaluation and verify the results:
- accuracy_value = evaluation.evaluate_once(
- '', checkpoint_path, log_dir, eval_op=update_op, final_op=value_op,
- hooks=[dumping_hook])
- self.assertAlmostEqual(accuracy_value, self._expected_accuracy)
-
- dump = debug_data.DebugDumpDir(
- glob.glob(os.path.join(dumping_root, 'run_*'))[0])
- # Here we simply assert that the dumped data has been loaded and is
- # non-empty. We do not care about the detailed model-internal tensors or
- # their values.
- self.assertTrue(dump.dumped_tensor_data)
- finally:
- if os.path.isdir(dumping_root):
- shutil.rmtree(dumping_root)
-
if __name__ == '__main__':
test.main()
diff --git a/tensorflow/contrib/slim/python/slim/nets/resnet_v1_test.py b/tensorflow/contrib/slim/python/slim/nets/resnet_v1_test.py
index b4fd2580c2..576444214d 100644
--- a/tensorflow/contrib/slim/python/slim/nets/resnet_v1_test.py
+++ b/tensorflow/contrib/slim/python/slim/nets/resnet_v1_test.py
@@ -386,7 +386,7 @@ class ResnetCompleteNetworkTest(test.TestCase):
inputs, None, is_training=False, global_pool=False)
sess.run(variables.global_variables_initializer())
self.assertAllClose(
- output.eval(), expected.eval(), atol=1e-4, rtol=1e-4)
+ output.eval(), expected.eval(), atol=2e-4, rtol=1e-4)
def testUnknownBatchSize(self):
batch = 2
diff --git a/tensorflow/contrib/summary/BUILD b/tensorflow/contrib/summary/BUILD
index 3c60d2bb56..da23f1c380 100644
--- a/tensorflow/contrib/summary/BUILD
+++ b/tensorflow/contrib/summary/BUILD
@@ -26,18 +26,12 @@ py_test(
deps = [
":summary_ops",
":summary_test_util",
- "//tensorflow/python:array_ops",
"//tensorflow/python:errors",
- "//tensorflow/python:framework",
"//tensorflow/python:framework_test_lib",
- "//tensorflow/python:math_ops",
- "//tensorflow/python:ops",
"//tensorflow/python:platform",
- "//tensorflow/python:state_ops",
"//tensorflow/python:training",
"//tensorflow/python/eager:function",
"//tensorflow/python/eager:test",
- "@six_archive//:six",
],
)
diff --git a/tensorflow/contrib/summary/summary.py b/tensorflow/contrib/summary/summary.py
index 813e8b2b09..ca82ea094c 100644
--- a/tensorflow/contrib/summary/summary.py
+++ b/tensorflow/contrib/summary/summary.py
@@ -28,13 +28,11 @@ from __future__ import print_function
from tensorflow.contrib.summary.summary_ops import all_summary_ops
from tensorflow.contrib.summary.summary_ops import always_record_summaries
from tensorflow.contrib.summary.summary_ops import audio
-from tensorflow.contrib.summary.summary_ops import create_summary_db_writer
from tensorflow.contrib.summary.summary_ops import create_summary_file_writer
from tensorflow.contrib.summary.summary_ops import eval_dir
from tensorflow.contrib.summary.summary_ops import generic
from tensorflow.contrib.summary.summary_ops import histogram
from tensorflow.contrib.summary.summary_ops import image
-from tensorflow.contrib.summary.summary_ops import import_event
from tensorflow.contrib.summary.summary_ops import never_record_summaries
from tensorflow.contrib.summary.summary_ops import record_summaries_every_n_global_steps
from tensorflow.contrib.summary.summary_ops import scalar
diff --git a/tensorflow/contrib/summary/summary_ops.py b/tensorflow/contrib/summary/summary_ops.py
index f6be99f6ae..56e3198593 100644
--- a/tensorflow/contrib/summary/summary_ops.py
+++ b/tensorflow/contrib/summary/summary_ops.py
@@ -19,12 +19,7 @@ from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
-import getpass
import os
-import re
-import time
-
-import six
from tensorflow.contrib.summary import gen_summary_ops
from tensorflow.python.eager import context
@@ -47,10 +42,6 @@ _SHOULD_RECORD_SUMMARIES_NAME = "ShouldRecordSummaries"
_SUMMARY_COLLECTION_NAME = "_SUMMARY_V2"
_SUMMARY_WRITER_INIT_COLLECTION_NAME = "_SUMMARY_WRITER_V2"
-_EXPERIMENT_NAME_PATTERNS = re.compile(r"^[^\x00-\x1F<>]{0,256}$")
-_RUN_NAME_PATTERNS = re.compile(r"^[^\x00-\x1F<>]{0,512}$")
-_USER_NAME_PATTERNS = re.compile(r"^[a-z]([-a-z0-9]{0,29}[a-z0-9])?$", re.I)
-
def should_record_summaries():
"""Returns boolean Tensor which is true if summaries should be recorded."""
@@ -66,14 +57,12 @@ def should_record_summaries():
# TODO(apassos) consider how to handle local step here.
@tf_contextlib.contextmanager
-def record_summaries_every_n_global_steps(n, global_step=None):
+def record_summaries_every_n_global_steps(n):
"""Sets the should_record_summaries Tensor to true if global_step % n == 0."""
- if global_step is None:
- global_step = training_util.get_global_step()
collection_ref = ops.get_collection_ref(_SHOULD_RECORD_SUMMARIES_NAME)
old = collection_ref[:]
with ops.device("cpu:0"):
- collection_ref[:] = [math_ops.equal(global_step % n, 0)]
+ collection_ref[:] = [math_ops.equal(training_util.get_global_step() % n, 0)]
yield
collection_ref[:] = old
@@ -141,8 +130,7 @@ def create_summary_file_writer(logdir,
flush once the queue gets bigger than this.
flush_millis: the largest interval between flushes.
filename_suffix: optional suffix for the event file name.
- name: Shared name for this SummaryWriter resource stored to default
- Graph.
+ name: Name for the summary writer.
Returns:
Either a summary writer or an empty object which can be used as a
@@ -157,81 +145,14 @@ def create_summary_file_writer(logdir,
flush_millis = constant_op.constant(2 * 60 * 1000)
if filename_suffix is None:
filename_suffix = constant_op.constant("")
- return _make_summary_writer(
- name,
- gen_summary_ops.create_summary_file_writer,
- logdir=logdir,
- max_queue=max_queue,
- flush_millis=flush_millis,
- filename_suffix=filename_suffix)
-
-
-def create_summary_db_writer(db_uri,
- experiment_name=None,
- run_name=None,
- user_name=None,
- name=None):
- """Creates a summary database writer in the current context.
-
- This can be used to write tensors from the execution graph directly
- to a database. Only SQLite is supported right now. This function
- will create the schema if it doesn't exist. Entries in the Users,
- Experiments, and Runs tables will be created automatically if they
- don't already exist.
-
- Args:
- db_uri: For example "file:/tmp/foo.sqlite".
- experiment_name: Defaults to YYYY-MM-DD in local time if None.
- Empty string means the Run will not be associated with an
- Experiment. Can't contain ASCII control characters or <>. Case
- sensitive.
- run_name: Defaults to HH:MM:SS in local time if None. Empty string
- means a Tag will not be associated with any Run. Can't contain
- ASCII control characters or <>. Case sensitive.
- user_name: Defaults to system username if None. Empty means the
- Experiment will not be associated with a User. Must be valid as
- both a DNS label and Linux username.
- name: Shared name for this SummaryWriter resource stored to default
- Graph.
-
- Returns:
- A new SummaryWriter instance.
- """
- with ops.device("cpu:0"):
- if experiment_name is None:
- experiment_name = time.strftime("%Y-%m-%d", time.localtime(time.time()))
- if run_name is None:
- run_name = time.strftime("%H:%M:%S", time.localtime(time.time()))
- if user_name is None:
- user_name = getpass.getuser()
- experiment_name = _cleanse_string(
- "experiment_name", _EXPERIMENT_NAME_PATTERNS, experiment_name)
- run_name = _cleanse_string("run_name", _RUN_NAME_PATTERNS, run_name)
- user_name = _cleanse_string("user_name", _USER_NAME_PATTERNS, user_name)
- return _make_summary_writer(
- name,
- gen_summary_ops.create_summary_db_writer,
- db_uri=db_uri,
- experiment_name=experiment_name,
- run_name=run_name,
- user_name=user_name)
-
-
-def _make_summary_writer(name, factory, **kwargs):
- resource = gen_summary_ops.summary_writer(shared_name=name)
- # TODO(apassos): Consider doing this instead.
- # node = factory(resource, **kwargs)
- # if not context.in_eager_mode():
- # ops.get_default_session().run(node)
- ops.add_to_collection(_SUMMARY_WRITER_INIT_COLLECTION_NAME,
- factory(resource, **kwargs))
- return SummaryWriter(resource)
-
-
-def _cleanse_string(name, pattern, value):
- if isinstance(value, six.string_types) and pattern.search(value) is None:
- raise ValueError("%s (%s) must match %s" % (name, value, pattern.pattern))
- return ops.convert_to_tensor(value, dtypes.string)
+ resource = gen_summary_ops.summary_writer(shared_name=name)
+ # TODO(apassos) ensure the initialization op runs when in graph mode;
+ # consider calling session.run here.
+ ops.add_to_collection(
+ _SUMMARY_WRITER_INIT_COLLECTION_NAME,
+ gen_summary_ops.create_summary_file_writer(
+ resource, logdir, max_queue, flush_millis, filename_suffix))
+ return SummaryWriter(resource)
def _nothing():
@@ -283,81 +204,68 @@ def summary_writer_function(name, tensor, function, family=None):
return op
-def generic(name, tensor, metadata=None, family=None, global_step=None):
+def generic(name, tensor, metadata, family=None):
"""Writes a tensor summary if possible."""
- if global_step is None:
- global_step = training_util.get_global_step()
+
def function(tag, scope):
- if metadata is None:
- serialized_metadata = constant_op.constant("")
- elif hasattr(metadata, "SerializeToString"):
- serialized_metadata = constant_op.constant(metadata.SerializeToString())
- else:
- serialized_metadata = metadata
# Note the identity to move the tensor to the CPU.
return gen_summary_ops.write_summary(
context.context().summary_writer_resource,
- global_step, array_ops.identity(tensor),
- tag, serialized_metadata, name=scope)
+ training_util.get_global_step(), array_ops.identity(tensor),
+ tag, metadata, name=scope)
return summary_writer_function(name, tensor, function, family=family)
-def scalar(name, tensor, family=None, global_step=None):
+def scalar(name, tensor, family=None):
"""Writes a scalar summary if possible."""
- if global_step is None:
- global_step = training_util.get_global_step()
+
def function(tag, scope):
# Note the identity to move the tensor to the CPU.
return gen_summary_ops.write_scalar_summary(
context.context().summary_writer_resource,
- global_step, tag, array_ops.identity(tensor),
+ training_util.get_global_step(), tag, array_ops.identity(tensor),
name=scope)
return summary_writer_function(name, tensor, function, family=family)
-def histogram(name, tensor, family=None, global_step=None):
+def histogram(name, tensor, family=None):
"""Writes a histogram summary if possible."""
- if global_step is None:
- global_step = training_util.get_global_step()
+
def function(tag, scope):
# Note the identity to move the tensor to the CPU.
return gen_summary_ops.write_histogram_summary(
context.context().summary_writer_resource,
- global_step, tag, array_ops.identity(tensor),
+ training_util.get_global_step(), tag, array_ops.identity(tensor),
name=scope)
return summary_writer_function(name, tensor, function, family=family)
-def image(name, tensor, bad_color=None, max_images=3, family=None,
- global_step=None):
+def image(name, tensor, bad_color=None, max_images=3, family=None):
"""Writes an image summary if possible."""
- if global_step is None:
- global_step = training_util.get_global_step()
+
def function(tag, scope):
bad_color_ = (constant_op.constant([255, 0, 0, 255], dtype=dtypes.uint8)
if bad_color is None else bad_color)
# Note the identity to move the tensor to the CPU.
return gen_summary_ops.write_image_summary(
context.context().summary_writer_resource,
- global_step, tag, array_ops.identity(tensor),
+ training_util.get_global_step(), tag, array_ops.identity(tensor),
bad_color_,
max_images, name=scope)
return summary_writer_function(name, tensor, function, family=family)
-def audio(name, tensor, sample_rate, max_outputs, family=None,
- global_step=None):
+def audio(name, tensor, sample_rate, max_outputs, family=None):
"""Writes an audio summary if possible."""
- if global_step is None:
- global_step = training_util.get_global_step()
+
def function(tag, scope):
# Note the identity to move the tensor to the CPU.
return gen_summary_ops.write_audio_summary(
context.context().summary_writer_resource,
- global_step,
+ training_util.get_global_step(),
tag,
array_ops.identity(tensor),
sample_rate=sample_rate,
@@ -367,26 +275,6 @@ def audio(name, tensor, sample_rate, max_outputs, family=None,
return summary_writer_function(name, tensor, function, family=family)
-def import_event(tensor, name=None):
- """Writes a tf.Event binary proto.
-
- When using create_summary_db_writer(), this can be used alongside
- tf.TFRecordReader to load event logs into the database. Please note
- that this is lower level than the other summary functions and will
- ignore any conditions set by methods like should_record_summaries().
-
- Args:
- tensor: A `Tensor` of type `string` containing a serialized `Event`
- proto.
- name: A name for the operation (optional).
-
- Returns:
- The created Operation.
- """
- return gen_summary_ops.import_event(
- context.context().summary_writer_resource, tensor, name=name)
-
-
def eval_dir(model_dir, name=None):
"""Construct a logdir for an eval summary writer."""
return os.path.join(model_dir, "eval" if not name else "eval_" + name)
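With the `global_step` arguments removed above, the summary ops read the step implicitly through `training_util.get_global_step()`. A minimal sketch, assuming eager execution is enabled and using a made-up logdir:

```python
import tensorflow.contrib.summary as tf_summary
from tensorflow.python.training import training_util

training_util.get_or_create_global_step()
writer = tf_summary.create_summary_file_writer('/tmp/logdir', max_queue=0)
with writer.as_default(), tf_summary.always_record_summaries():
  # The step is taken from the default global step, not passed explicitly.
  tf_summary.scalar('loss', 0.5)
```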
diff --git a/tensorflow/contrib/summary/summary_ops_test.py b/tensorflow/contrib/summary/summary_ops_test.py
index 6e1a746815..de7ae6ec27 100644
--- a/tensorflow/contrib/summary/summary_ops_test.py
+++ b/tensorflow/contrib/summary/summary_ops_test.py
@@ -17,22 +17,14 @@ from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
-import functools
-import os
import tempfile
-import six
-import sqlite3
-
from tensorflow.contrib.summary import summary_ops
from tensorflow.contrib.summary import summary_test_util
from tensorflow.python.eager import function
from tensorflow.python.eager import test
-from tensorflow.python.framework import dtypes
from tensorflow.python.framework import errors
from tensorflow.python.framework import test_util
-from tensorflow.python.ops import array_ops
-from tensorflow.python.ops import state_ops
from tensorflow.python.platform import gfile
from tensorflow.python.training import training_util
@@ -94,120 +86,6 @@ class TargetTest(test_util.TensorFlowTestCase):
self.assertEqual(len(events), 2)
self.assertEqual(events[1].summary.value[0].tag, 'scalar')
- def testSummaryGlobalStep(self):
- global_step = training_util.get_or_create_global_step()
- logdir = tempfile.mkdtemp()
- with summary_ops.create_summary_file_writer(
- logdir, max_queue=0,
- name='t2').as_default(), summary_ops.always_record_summaries():
-
- summary_ops.scalar('scalar', 2.0, global_step=global_step)
-
- events = summary_test_util.events_from_file(logdir)
- self.assertEqual(len(events), 2)
- self.assertEqual(events[1].summary.value[0].tag, 'scalar')
-
-
-class DbTest(test_util.TensorFlowTestCase):
-
- def setUp(self):
- self.db_path = os.path.join(self.get_temp_dir(), 'DbTest.sqlite')
- if os.path.exists(self.db_path):
- os.unlink(self.db_path)
- self.db = sqlite3.connect(self.db_path)
- self.create_summary_db_writer = functools.partial(
- summary_ops.create_summary_db_writer,
- db_uri=self.db_path,
- experiment_name='experiment',
- run_name='run',
- user_name='user')
-
- def tearDown(self):
- self.db.close()
-
- def testIntegerSummaries(self):
- step = training_util.create_global_step()
-
- def adder(x, y):
- state_ops.assign_add(step, 1)
- summary_ops.generic('x', x)
- summary_ops.generic('y', y)
- sum_ = x + y
- summary_ops.generic('sum', sum_)
- return sum_
-
- with summary_ops.always_record_summaries():
- with self.create_summary_db_writer().as_default():
- self.assertEqual(5, adder(int64(2), int64(3)).numpy())
-
- six.assertCountEqual(self, [1, 1, 1],
- get_all(self.db, 'SELECT step FROM Tensors'))
- six.assertCountEqual(self, ['x', 'y', 'sum'],
- get_all(self.db, 'SELECT tag_name FROM Tags'))
- x_id = get_one(self.db, 'SELECT tag_id FROM Tags WHERE tag_name = "x"')
- y_id = get_one(self.db, 'SELECT tag_id FROM Tags WHERE tag_name = "y"')
- sum_id = get_one(self.db, 'SELECT tag_id FROM Tags WHERE tag_name = "sum"')
-
- with summary_ops.always_record_summaries():
- with self.create_summary_db_writer().as_default():
- self.assertEqual(9, adder(int64(4), int64(5)).numpy())
-
- six.assertCountEqual(self, [1, 1, 1, 2, 2, 2],
- get_all(self.db, 'SELECT step FROM Tensors'))
- six.assertCountEqual(self, [x_id, y_id, sum_id],
- get_all(self.db, 'SELECT tag_id FROM Tags'))
- self.assertEqual(2, get_tensor(self.db, x_id, 1))
- self.assertEqual(3, get_tensor(self.db, y_id, 1))
- self.assertEqual(5, get_tensor(self.db, sum_id, 1))
- self.assertEqual(4, get_tensor(self.db, x_id, 2))
- self.assertEqual(5, get_tensor(self.db, y_id, 2))
- self.assertEqual(9, get_tensor(self.db, sum_id, 2))
- six.assertCountEqual(
- self, ['experiment'],
- get_all(self.db, 'SELECT experiment_name FROM Experiments'))
- six.assertCountEqual(self, ['run'],
- get_all(self.db, 'SELECT run_name FROM Runs'))
- six.assertCountEqual(self, ['user'],
- get_all(self.db, 'SELECT user_name FROM Users'))
-
- def testBadExperimentName(self):
- with self.assertRaises(ValueError):
- self.create_summary_db_writer(experiment_name='\0')
-
- def testBadRunName(self):
- with self.assertRaises(ValueError):
- self.create_summary_db_writer(run_name='\0')
-
- def testBadUserName(self):
- with self.assertRaises(ValueError):
- self.create_summary_db_writer(user_name='-hi')
- with self.assertRaises(ValueError):
- self.create_summary_db_writer(user_name='hi-')
- with self.assertRaises(ValueError):
- self.create_summary_db_writer(user_name='@')
-
-
-def get_one(db, q, *p):
- return db.execute(q, p).fetchone()[0]
-
-
-def get_all(db, q, *p):
- return unroll(db.execute(q, p).fetchall())
-
-
-def get_tensor(db, tag_id, step):
- return get_one(
- db, 'SELECT tensor FROM Tensors WHERE tag_id = ? AND step = ?', tag_id,
- step)
-
-
-def int64(x):
- return array_ops.constant(x, dtypes.int64)
-
-
-def unroll(list_of_tuples):
- return sum(list_of_tuples, ())
-
if __name__ == '__main__':
test.main()
diff --git a/tensorflow/contrib/tensorboard/db/BUILD b/tensorflow/contrib/tensorboard/db/BUILD
index 068e862650..d8bbf87d2c 100644
--- a/tensorflow/contrib/tensorboard/db/BUILD
+++ b/tensorflow/contrib/tensorboard/db/BUILD
@@ -45,12 +45,10 @@ cc_library(
tf_cc_test(
name = "summary_db_writer_test",
- size = "small",
srcs = ["summary_db_writer_test.cc"],
deps = [
":summary_db_writer",
"//tensorflow/core:lib",
- "//tensorflow/core:protos_all_cc",
"//tensorflow/core:test",
"//tensorflow/core:test_main",
"//tensorflow/core/lib/db:sqlite",
diff --git a/tensorflow/contrib/tensorboard/db/summary_db_writer.cc b/tensorflow/contrib/tensorboard/db/summary_db_writer.cc
index a26ad61660..df64e36305 100644
--- a/tensorflow/contrib/tensorboard/db/summary_db_writer.cc
+++ b/tensorflow/contrib/tensorboard/db/summary_db_writer.cc
@@ -15,12 +15,10 @@ limitations under the License.
#include "tensorflow/contrib/tensorboard/db/summary_db_writer.h"
#include "tensorflow/contrib/tensorboard/db/schema.h"
-#include "tensorflow/core/framework/summary.pb.h"
#include "tensorflow/core/lib/db/sqlite.h"
#include "tensorflow/core/lib/random/random.h"
#include "tensorflow/core/lib/strings/stringprintf.h"
#include "tensorflow/core/platform/snappy.h"
-#include "tensorflow/core/util/event.pb.h"
namespace tensorflow {
namespace {
@@ -88,19 +86,13 @@ class SummaryDbWriter : public SummaryWriterInterface {
TF_RETURN_IF_ERROR(BindTensor(t));
break;
}
- return insert_tensor_.StepAndReset();
+ TF_RETURN_IF_ERROR(insert_tensor_.StepAndReset());
+ return Status::OK();
}
Status WriteEvent(std::unique_ptr<Event> e) override {
- mutex_lock ml(mu_);
- TF_RETURN_IF_ERROR(InitializeParents());
- if (e->what_case() == Event::WhatCase::kSummary) {
- const Summary& summary = e->summary();
- for (int i = 0; i < summary.value_size(); ++i) {
- TF_RETURN_IF_ERROR(WriteSummary(e.get(), summary.value(i)));
- }
- }
- return Status::OK();
+ // TODO(@jart): This will be used to load event logs.
+ return errors::Unimplemented("WriteEvent");
}
Status WriteScalar(int64 global_step, Tensor t, const string& tag) override {
@@ -255,24 +247,6 @@ class SummaryDbWriter : public SummaryWriterInterface {
return Status::OK();
}
- Status WriteSummary(const Event* e, const Summary::Value& summary)
- EXCLUSIVE_LOCKS_REQUIRED(mu_) {
- int64 tag_id;
- TF_RETURN_IF_ERROR(GetTagId(run_id_, summary.tag(), &tag_id));
- insert_tensor_.BindInt(1, tag_id);
- insert_tensor_.BindInt(2, e->step());
- insert_tensor_.BindDouble(3, e->wall_time());
- switch (summary.value_case()) {
- case Summary::Value::ValueCase::kSimpleValue:
- insert_tensor_.BindDouble(4, summary.simple_value());
- break;
- default:
- // TODO(@jart): Handle the rest.
- return Status::OK();
- }
- return insert_tensor_.StepAndReset();
- }
-
mutex mu_;
Env* env_;
std::shared_ptr<Sqlite> db_ GUARDED_BY(mu_);
diff --git a/tensorflow/contrib/tensorboard/db/summary_db_writer_test.cc b/tensorflow/contrib/tensorboard/db/summary_db_writer_test.cc
index c1af51e7b7..d32904f97c 100644
--- a/tensorflow/contrib/tensorboard/db/summary_db_writer_test.cc
+++ b/tensorflow/contrib/tensorboard/db/summary_db_writer_test.cc
@@ -14,19 +14,14 @@ limitations under the License.
==============================================================================*/
#include "tensorflow/contrib/tensorboard/db/summary_db_writer.h"
-#include "tensorflow/core/framework/summary.pb.h"
#include "tensorflow/core/lib/core/status_test_util.h"
#include "tensorflow/core/lib/db/sqlite.h"
-#include "tensorflow/core/lib/strings/strcat.h"
#include "tensorflow/core/platform/env.h"
#include "tensorflow/core/platform/test.h"
-#include "tensorflow/core/util/event.pb.h"
namespace tensorflow {
namespace {
-const float kTolerance = 1e-5;
-
Tensor MakeScalarInt64(int64 x) {
Tensor t(DT_INT64, TensorShape({}));
t.scalar<int64>()() = x;
@@ -46,7 +41,7 @@ class FakeClockEnv : public EnvWrapper {
class SummaryDbWriterTest : public ::testing::Test {
protected:
- void SetUp() override { db_ = Sqlite::Open(":memory:").ValueOrDie(); }
+ void SetUp() override { db_ = Sqlite::Open("file::memory:").ValueOrDie(); }
void TearDown() override {
if (writer_ != nullptr) {
@@ -163,54 +158,5 @@ TEST_F(SummaryDbWriterTest, TensorsWritten_RowsGetInitialized) {
QueryString("SELECT tensor FROM Tensors WHERE step = 2").empty());
}
-TEST_F(SummaryDbWriterTest, EmptyParentNames_NoParentsCreated) {
- TF_ASSERT_OK(CreateSummaryDbWriter(db_, "", "", "", &env_, &writer_));
- TF_ASSERT_OK(writer_->WriteTensor(1, MakeScalarInt64(123LL), "taggy",
- "this-is-metaaa"));
- TF_ASSERT_OK(writer_->Flush());
- ASSERT_EQ(0LL, QueryInt("SELECT COUNT(*) FROM Users"));
- ASSERT_EQ(0LL, QueryInt("SELECT COUNT(*) FROM Experiments"));
- ASSERT_EQ(0LL, QueryInt("SELECT COUNT(*) FROM Runs"));
- ASSERT_EQ(1LL, QueryInt("SELECT COUNT(*) FROM Tags"));
- ASSERT_EQ(1LL, QueryInt("SELECT COUNT(*) FROM Tensors"));
-}
-
-TEST_F(SummaryDbWriterTest, WriteEvent_Scalar) {
- TF_ASSERT_OK(CreateSummaryDbWriter(db_, "", "", "", &env_, &writer_));
- std::unique_ptr<Event> e{new Event};
- e->set_step(7);
- e->set_wall_time(123.456);
- Summary::Value* s = e->mutable_summary()->add_value();
- s->set_tag("Ï€");
- s->set_simple_value(3.14f);
- s = e->mutable_summary()->add_value();
- s->set_tag("φ");
- s->set_simple_value(1.61f);
- TF_ASSERT_OK(writer_->WriteEvent(std::move(e)));
- TF_ASSERT_OK(writer_->Flush());
- ASSERT_EQ(2LL, QueryInt("SELECT COUNT(*) FROM Tags"));
- ASSERT_EQ(2LL, QueryInt("SELECT COUNT(*) FROM Tensors"));
- int64 tag1_id = QueryInt("SELECT tag_id FROM Tags WHERE tag_name = 'π'");
- int64 tag2_id = QueryInt("SELECT tag_id FROM Tags WHERE tag_name = 'φ'");
- EXPECT_GT(tag1_id, 0LL);
- EXPECT_GT(tag2_id, 0LL);
- EXPECT_EQ(123.456, QueryDouble(strings::StrCat(
- "SELECT computed_time FROM Tensors WHERE tag_id = ",
- tag1_id, " AND step = 7")));
- EXPECT_EQ(123.456, QueryDouble(strings::StrCat(
- "SELECT computed_time FROM Tensors WHERE tag_id = ",
- tag2_id, " AND step = 7")));
- EXPECT_NEAR(3.14,
- QueryDouble(strings::StrCat(
- "SELECT tensor FROM Tensors WHERE tag_id = ", tag1_id,
- " AND step = 7")),
- kTolerance); // Summary::simple_value is float
- EXPECT_NEAR(1.61,
- QueryDouble(strings::StrCat(
- "SELECT tensor FROM Tensors WHERE tag_id = ", tag2_id,
- " AND step = 7")),
- kTolerance);
-}
-
} // namespace
} // namespace tensorflow
diff --git a/tensorflow/contrib/tpu/python/tpu/tpu_config.py b/tensorflow/contrib/tpu/python/tpu/tpu_config.py
index 916b9b3082..3965c087a1 100644
--- a/tensorflow/contrib/tpu/python/tpu/tpu_config.py
+++ b/tensorflow/contrib/tpu/python/tpu/tpu_config.py
@@ -45,7 +45,10 @@ class TPUConfig(
is invoked once on each host. To be precise, with a global batch size
`train_batch_size` in `TPUEstimator` constructor, the batch size for each
shard is `train_batch_size` // #hosts. With Per-Core input pipeline
- deployment, the shard batch size is `train_batch_size` // #cores.
+ deployment, the shard batch size is `train_batch_size` // #cores. Note
+ that per-host deployment currently only works for single-host TPU training
+ (tracked in b/67051042). For multi-host, please use Per-Core, i.e., `False`
+ for `per_host_input_for_training`.
tpu_job_name: The name of the TPU job. Typically, this name is auto-inferred
within TPUEstimator, however when using ClusterSpec propagation in more
esoteric cluster configurations, you may need to specify the job name as a
@@ -106,12 +109,3 @@ class RunConfig(run_config_lib.RunConfig):
@property
def tpu_config(self):
return self._tpu_config
-
- def replace(self, **kwargs):
- if 'tpu_config' not in kwargs:
- return super(RunConfig, self).replace(**kwargs)
-
- tpu_config = kwargs.pop('tpu_config')
- new_instance = super(RunConfig, self).replace(**kwargs)
- new_instance._tpu_config = tpu_config # pylint: disable=protected-access
- return new_instance
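Per the docstring note above, per-host input pipelines are currently limited to single-host training. A sketch with made-up values of requesting Per-Core input deployment for a larger job:

```python
from tensorflow.contrib.tpu.python.tpu import tpu_config

config = tpu_config.RunConfig(
    tpu_config=tpu_config.TPUConfig(
        iterations_per_loop=100,
        num_shards=16,
        per_host_input_for_training=False))  # Per-Core input pipeline
```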
diff --git a/tensorflow/contrib/tpu/python/tpu/tpu_estimator.py b/tensorflow/contrib/tpu/python/tpu/tpu_estimator.py
index 07877fcc76..060b3f9129 100644
--- a/tensorflow/contrib/tpu/python/tpu/tpu_estimator.py
+++ b/tensorflow/contrib/tpu/python/tpu/tpu_estimator.py
@@ -66,7 +66,7 @@ _CROSS_REPLICA_SUM_OP = 'CrossReplicaSum'
_RESERVED_PARAMS_KEYS = [_BATCH_SIZE_KEY]
# TODO(b/65703635): Flip the value and remove all dead code.
-_WRAP_INPUT_FN_INTO_WHILE_LOOP = False
+_WRAP_INPUT_FN_INTO_WHILE_LOOP = True
def _create_global_step(graph):
@@ -232,10 +232,8 @@ class _TPUContext(object):
mode == model_fn_lib.ModeKeys.TRAIN
else self._eval_batch_size)
# On TPU
- if self.is_input_sharded_per_core():
- return global_batch_size // self.num_cores
- else:
- return global_batch_size // self.num_hosts
+ return (global_batch_size // self.num_cores
+ if self.is_input_sharded_per_core() else global_batch_size)
@property
def batch_size_for_model_fn(self):
@@ -537,15 +535,13 @@ class TPUInfeedOutfeedSessionHook(session_run_hook.SessionRunHook):
session, self._dequeue_ops)
def before_run(self, run_context):
- iterations = run_context.session.run(self._iterations_per_loop_var)
-
- logging.info('Enqueue next (%d) batch(es) of data to infeed.', iterations)
+ logging.info('Enqueue next batch of data to infeed.')
+ iterations = run_context.session.run(self._iterations_per_loop_var)
self._infeed_thd_controller.send_next_batch_signal(iterations)
if self._dequeue_ops is not None:
# TODO(xiejw): Refactor the outfeed dequeue into tf.while_loop.
- logging.info(
- 'Dequeue next (%d) batch(es) of data from outfeed.', iterations)
+ logging.info('Dequeue next batch of data from outfeed.')
self._outfeed_thd_controller.send_next_batch_signal(iterations)
def end(self, session):
@@ -684,40 +680,6 @@ def generate_per_core_enqueue_ops_fn_for_host(
return enqueue_ops_fn, (lambda: infeed_queue_holder['instance'])
-def generate_per_host_enqueue_ops_fn_for_host(
- ctx, input_fn, inputs_structure_recorder, batch_axis, device):
- """Generates infeed enqueue ops for per-host input_fn on a single host."""
- infeed_queue_holder = {'instance': None}
-
- def enqueue_ops_fn():
- with ops.device(device):
- num_cores_per_host = ctx.num_of_cores_per_host
- inputs = input_fn()
- if isinstance(inputs, tuple):
- features, labels = inputs
- else:
- features, labels = inputs, None
- inputs_structure_recorder.validate_and_record_structure(
- features, labels)
- unsharded_tensor_list = (
- inputs_structure_recorder.flatten_features_and_labels(
- features, labels))
-
- infeed_queue = tpu_feed.InfeedQueue(
- tuple_types=[t.dtype for t in unsharded_tensor_list],
- tuple_shapes=[t.shape for t in unsharded_tensor_list],
- shard_dimensions=batch_axis)
- infeed_queue_holder['instance'] = infeed_queue
- infeed_queue.set_number_of_shards(num_cores_per_host)
-
- per_host_enqueue_ops = (
- infeed_queue.split_inputs_and_generate_enqueue_ops(
- unsharded_tensor_list,
- placement_function=lambda x: device))
- return per_host_enqueue_ops
- return enqueue_ops_fn, (lambda: infeed_queue_holder['instance'])
-
-
class _InputPipeline(object):
"""`_InputPipeline` handles invoking `input_fn` and piping to infeed queue.
@@ -880,8 +842,6 @@ class _InputPipeline(object):
# structure is recorded.
enqueue_ops = self._invoke_input_fn_and_record_structure()
- self._validate_input_pipeline()
-
def dequeue_fn():
"""dequeue_fn is used by TPU to retrieve the tensors."""
values = self._infeed_queue.generate_dequeue_op()
@@ -892,15 +852,15 @@ class _InputPipeline(object):
return (enqueue_ops, dequeue_fn)
def _invoke_input_fn_and_record_structure(self):
- """Deploys the input pipeline and record input structure."""
- enqueue_ops = []
- infeed_queues = []
- num_hosts = self._ctx.num_hosts
- tpu_host_placement_fn = self._ctx.tpu_host_placement_function
if self._sharded_per_core:
# Per-Core input pipeline deployment.
+ tpu_host_placement_fn = self._ctx.tpu_host_placement_function
+ enqueue_ops = []
+ infeed_queues = []
+
# Invoke input pipeline for each core and placed on the corresponding
# host.
+ num_hosts = self._ctx.num_hosts
for host_id in range(num_hosts):
host_device = tpu_host_placement_fn(host_id=host_id)
with ops.device(host_device):
@@ -917,52 +877,48 @@ class _InputPipeline(object):
# Infeed_queue_getter must be called after enqueue_ops_fn is called.
infeed_queues.append(infeed_queue_getter())
+ # infeed_queue is used to generate dequeue ops. The only things it uses
+ # for dequeue are dtypes and shapes, so any one of the queues can be used.
+ # Here, grab the first one.
+ self._infeed_queue = infeed_queues[0]
+ return enqueue_ops
+
else:
- for host_id in range(num_hosts):
- host_device = tpu_host_placement_fn(host_id=host_id)
+ # TODO(b/67051042): Extend this to multi-host support.
+ host_id = 0
+ host_device = self._ctx.tpu_host_placement_function(host_id=host_id)
+ def enqueue_fn():
with ops.device(host_device):
with ops.name_scope('input_pipeline_task%d' % (host_id)):
- enqueue_ops_fn, infeed_queue_getter = (
- generate_per_host_enqueue_ops_fn_for_host(
- self._ctx, self._input_fn, self._inputs_structure_recorder,
- self._batch_axis, host_device))
-
- if _WRAP_INPUT_FN_INTO_WHILE_LOOP:
- enqueue_ops.append(_wrap_computation_in_while_loop(
- device=host_device, op_fn=enqueue_ops_fn))
+ inputs = self._input_fn()
+ if isinstance(inputs, tuple):
+ features, labels = inputs
else:
- enqueue_ops.append(enqueue_ops_fn())
- infeed_queues.append(infeed_queue_getter())
- # infeed_queue is used to generate dequeue ops. The only thing it uses for
- # dequeue is dtypes and types. So, any one can be used. Here, grab the
- # first one.
- self._infeed_queue = infeed_queues[0]
- return enqueue_ops
-
- def _validate_input_pipeline(self):
- # Perform some sanity checks to log user friendly information. We should
- # error out to give users better error message. But, if
- # _WRAP_INPUT_FN_INTO_WHILE_LOOP is False (legacy behavior), we cannot break
- # user code, so, log a warning.
- if ops.get_default_graph().get_collection(ops.GraphKeys.QUEUE_RUNNERS):
- err_msg = ('Input pipeline contains one or more QueueRunners. '
- 'These are not supported via TPUEstimator. You must convert '
- 'your input pipeline to use `tf.data` instead (see '
- 'https://www.tensorflow.org/programmers_guide/datasets for '
- 'instructions.')
- if _WRAP_INPUT_FN_INTO_WHILE_LOOP:
- raise RuntimeError(err_msg)
- else:
- logging.warn(err_msg)
- elif ops.get_default_graph().get_collection(ops.GraphKeys.SUMMARIES):
- # Queue Runner has summary Ops by default. So here we use elif to do
- # necessary checks for Dataset input pipeline only.
- err_msg = ('Input pipeline contains `tf.summary` operations. '
- 'These are not currently supported.')
+ features, labels = inputs, None
+ self._inputs_structure_recorder.validate_and_record_structure(
+ features, labels)
+ unsharded_tensor_list = (
+ self._inputs_structure_recorder.flatten_features_and_labels(
+ features, labels))
+
+ self._infeed_queue = tpu_feed.InfeedQueue(
+ tuple_types=[t.dtype for t in unsharded_tensor_list],
+ tuple_shapes=[t.shape for t in unsharded_tensor_list],
+ shard_dimensions=self._batch_axis)
+ self._infeed_queue.set_number_of_shards(self._ctx.num_cores)
+
+ def placement_fn(core_id):
+ return self._ctx.tpu_host_placement_function(core_id=core_id)
+ return (
+ self._infeed_queue.split_inputs_and_generate_enqueue_ops(
+ unsharded_tensor_list,
+ placement_function=placement_fn))
+
if _WRAP_INPUT_FN_INTO_WHILE_LOOP:
- raise RuntimeError(err_msg)
+ return _wrap_computation_in_while_loop(device=host_device,
+ op_fn=enqueue_fn)
else:
- logging.warn(err_msg)
+ return enqueue_fn()
class _ModelFnWrapper(object):
@@ -1440,6 +1396,12 @@ class TPUEstimator(estimator_lib.Estimator):
'eval batch size {} must be divisible by number of shards {}'
.format(eval_batch_size, config.tpu_config.num_shards))
+ if (config.tpu_config.num_shards > 8 and
+ config.tpu_config.per_host_input_for_training):
+ # TODO(b/67051042): Support per_host input pipelines when num_shards > 8
+ raise NotImplementedError(
+ 'Per-host input pipelines only available for num_shards <= 8')
+
# Verifies the model_fn signature according to Estimator framework.
estimator_lib._verify_model_fn_args(model_fn, params) # pylint: disable=protected-access
# We cannot store config and params in this constructor as parent
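A worked example of the shard batch-size rule changed in `_TPUContext` above, with made-up numbers:

```python
train_batch_size = 1024
num_cores = 8

# Per-Core input pipeline: each input_fn invocation receives the global
# batch divided across all cores.
per_core_shard = train_batch_size // num_cores  # 128

# Per-Host input pipeline (single host after this change): the one input_fn
# on the host receives the full global batch.
per_host_shard = train_batch_size  # 1024
```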
diff --git a/tensorflow/contrib/training/python/training/hparam.py b/tensorflow/contrib/training/python/training/hparam.py
index 7db625cdd5..391899b34f 100644
--- a/tensorflow/contrib/training/python/training/hparam.py
+++ b/tensorflow/contrib/training/python/training/hparam.py
@@ -18,7 +18,6 @@ from __future__ import division
from __future__ import print_function
import json
-import numbers
import re
import six
@@ -77,7 +76,7 @@ def _process_scalar_value(name, parse_fn, var_type, m_dict, values,
function.
Raises:
- ValueError: If the name has already been used.
+ ValueError: If the name has already been used.
"""
try:
parsed_value = parse_fn(m_dict['val'])
@@ -139,54 +138,6 @@ def _process_list_value(name, parse_fn, var_type, m_dict, values,
_parse_fail(name, var_type, m_dict['vals'], values)
-def _cast_to_type_if_compatible(name, param_type, value):
- """Cast hparam to the provided type, if compatible.
-
- Args:
- name: Name of the hparam to be cast.
- param_type: The type of the hparam.
- value: The value to be cast, if compatible.
-
- Returns:
- The result of casting `value` to `param_type`.
-
- Raises:
- ValueError: If the type of `value` is not compatible with param_type.
- * If `param_type` is a string type, but `value` is not.
- * If `param_type` is a boolean, but `value` is not, or vice versa.
- * If `param_type` is an integer type, but `value` is not.
- * If `param_type` is a float type, but `value` is not a numeric type.
- """
- fail_msg = (
- "Could not cast hparam '%s' of type '%s' from value %r" %
- (name, param_type, value))
-
- # Some callers use None, for which we can't do any casting/checking. :(
- if issubclass(param_type, type(None)):
- return value
-
- # Avoid converting a non-string type to a string.
- if (issubclass(param_type, (six.string_types, six.binary_type)) and
- not isinstance(value, (six.string_types, six.binary_type))):
- raise ValueError(fail_msg)
-
- # Avoid converting a number or string type to a boolean or vice versa.
- if issubclass(param_type, bool) != isinstance(value, bool):
- raise ValueError(fail_msg)
-
- # Avoid converting float to an integer (the reverse is fine).
- if (issubclass(param_type, numbers.Integral) and
- not isinstance(value, numbers.Integral)):
- raise ValueError(fail_msg)
-
- # Avoid converting a non-numeric type to a numeric type.
- if (issubclass(param_type, numbers.Number) and
- not isinstance(value, numbers.Number)):
- raise ValueError(fail_msg)
-
- return param_type(value)
-
-
def parse_values(values, type_map):
"""Parses hyperparameter values from a string into a python map.
@@ -487,18 +438,17 @@ class HParams(object):
Raises:
ValueError: If there is a type mismatch.
"""
- param_type, is_list = self._hparam_types[name]
+ _, is_list = self._hparam_types[name]
if isinstance(value, list):
if not is_list:
raise ValueError(
'Must not pass a list for single-valued parameter: %s' % name)
- setattr(self, name, [
- _cast_to_type_if_compatible(name, param_type, v) for v in value])
+ setattr(self, name, value)
else:
if is_list:
raise ValueError(
'Must pass a list for multi-valued parameter: %s.' % name)
- setattr(self, name, _cast_to_type_if_compatible(name, param_type, value))
+ setattr(self, name, value)
def parse(self, values):
"""Override hyperparameter values, parsing new values from a string.
diff --git a/tensorflow/contrib/training/python/training/hparam_test.py b/tensorflow/contrib/training/python/training/hparam_test.py
index 949c262f5b..f54514cefd 100644
--- a/tensorflow/contrib/training/python/training/hparam_test.py
+++ b/tensorflow/contrib/training/python/training/hparam_test.py
@@ -318,42 +318,13 @@ class HParamsTest(test.TestCase):
self.assertEqual(3.0, hparams.b)
self.assertEqual('relu4', hparams.c_c)
- def testSetHParamListNonListMismatch(self):
+ def testSetHParamTypeMismatch(self):
hparams = hparam.HParams(a=1, b=[2.0, 3.0])
with self.assertRaisesRegexp(ValueError, r'Must not pass a list'):
hparams.set_hparam('a', [1.0])
with self.assertRaisesRegexp(ValueError, r'Must pass a list'):
hparams.set_hparam('b', 1.0)
- def testSetHParamTypeMismatch(self):
- hparams = hparam.HParams(
- int_=1, str_='str', bool_=True, float_=1.1, list_int=[1, 2], none=None)
-
- with self.assertRaises(ValueError):
- hparams.set_hparam('str_', 2.2)
-
- with self.assertRaises(ValueError):
- hparams.set_hparam('int_', False)
-
- with self.assertRaises(ValueError):
- hparams.set_hparam('bool_', 1)
-
- with self.assertRaises(ValueError):
- hparams.set_hparam('int_', 2.2)
-
- with self.assertRaises(ValueError):
- hparams.set_hparam('list_int', [2, 3.3])
-
- with self.assertRaises(ValueError):
- hparams.set_hparam('int_', '2')
-
- # Casting int to float is OK
- hparams.set_hparam('float_', 1)
-
- # Getting stuck with NoneType :(
- hparams.set_hparam('none', '1')
- self.assertEqual('1', hparams.none)
-
def testNonProtoFails(self):
with self.assertRaisesRegexp(AssertionError, ''):
hparam.HParams(hparam_def=1)
diff --git a/tensorflow/contrib/verbs/README.md b/tensorflow/contrib/verbs/README.md
index da5f2b0223..dcb390b0a5 100644
--- a/tensorflow/contrib/verbs/README.md
+++ b/tensorflow/contrib/verbs/README.md
@@ -1,4 +1,4 @@
-## How to compile and use RDMA-enabled TensorFlow
+## How to compile, use and configure RDMA-enabled TensorFlow
1. Follow the regular TF compilation instructions. During configure step, if you want ibverbs based RDMA support, answer yes to this question:
```Do you wish to build TensorFlow with VERBS-RDMA support [y/N]```
@@ -7,6 +7,18 @@
```server = tf.train.Server(cluster, job_name="local", task_index=0, protocol='grpc+verbs') # default protocol is 'grpc'```
+3. RDMA configuration is done by setting the following environment variables (a usage sketch follows this file's diff):
+ * **RDMA_DEVICE**: The RDMA device name to be used. If not defined by the user, a default device with an active port is selected, if one exists.
+ * **RDMA_DEVICE_PORT**: The port within the selected device. Not relevant if RDMA_DEVICE is not defined. If not defined by the user, a default active port is selected, if one exists.
+ * **RDMA_GID_INDEX**: The GID index of the port. If not defined by the user, a suitable default GID index is selected (RoCE v2 is preferred as the default).
+ * **RDMA_QP_PKEY_INDEX**: The Pkey for the QP. If not defined by the user, the default value is 0.
+ * **RDMA_QP_QUEUE_DEPTH**: TX/RX queue size for the QP. If not defined by the user, the default value is 1024.
+ * **RDMA_QP_TIMEOUT**: The retransmission timeout for QPs. If not defined by the user, the default value is 14.
+ * **RDMA_QP_RETRY_COUNT**: Number of retransmissions for QPs. If not defined by the user, the default value is 7.
+ * **RDMA_QP_SL**: Service level configuration for QoS and ECN; valid values are 0-7. If not defined by the user, the default value is 0.
+ * **RDMA_QP_MTU**: MTU configuration for the QPs. If not defined by the user, the default value is the active MTU from query_port.
+ * **RDMA_TRAFFIC_CLASS**: Traffic class configuration for the QP, in case of DSCP trust-level QoS configuration. If not defined by the user, the default value is 0. For more info see [HowTo Configure Trust state on Mellanox Adapters](https://community.mellanox.com/docs/DOC-2866).
+
## Overview
The design is based on TensorFlow r1.0. An RDMA path is added between servers for tensor transfer (weights, gradients, etc). The existing GRPC path remains and is responsible for "administrative" tasks, such as setting up the RDMA path, exchanging computation graphs, etc.
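A sketch of configuring the environment variables listed in this README before starting a verbs-enabled server; the device name and values are made up:

```python
import os
import tensorflow as tf

os.environ['RDMA_DEVICE'] = 'mlx5_0'        # hypothetical device name
os.environ['RDMA_DEVICE_PORT'] = '1'
os.environ['RDMA_QP_QUEUE_DEPTH'] = '2048'

cluster = tf.train.ClusterSpec({'local': ['localhost:2222']})
server = tf.train.Server(cluster, job_name='local', task_index=0,
                         protocol='grpc+verbs')
```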
diff --git a/tensorflow/contrib/verbs/rdma.cc b/tensorflow/contrib/verbs/rdma.cc
index 26e18b28aa..331943a3ef 100644
--- a/tensorflow/contrib/verbs/rdma.cc
+++ b/tensorflow/contrib/verbs/rdma.cc
@@ -17,6 +17,7 @@ limitations under the License.
#include "tensorflow/contrib/verbs/rdma.h"
#include <cstdlib>
+#include <fcntl.h>
#include "tensorflow/contrib/verbs/verbs_util.h"
#include "tensorflow/core/common_runtime/device_mgr.h"
#include "tensorflow/core/common_runtime/dma_helper.h"
@@ -33,6 +34,8 @@ limitations under the License.
namespace tensorflow {
+#define RoCE_V2 "RoCE v2"
+
namespace {
// hash name to 32-bit integer
uint32_t NameHash(const string& name) {
@@ -66,16 +69,337 @@ string MessageTypeToString(RdmaMessageType rmt) {
}
} // namespace
-ibv_context* open_default_device() {
+// Function to get an environment variable
+// Args:
+// var_name - the name of the environment variable
+// Returns:
+// string with its value, or an empty string if not set
+string get_env_var(char const* var_name) {
+ char const* var_temp = getenv(var_name);
+
+ return (var_temp == NULL) ? string() : string(var_temp);
+}
+
+// Function to open device
+// Args:
+// ibv_dev device to open
+// Returns:
+// context of the opened device
+ibv_context* open_device(ibv_device* ibv_dev) {
+ ibv_context* context = ibv_open_device(ibv_dev);
+
+ CHECK(context) << "Open context failed for " << ibv_get_device_name(ibv_dev);
+ return context;
+}
+
+// Function to count the number of active ports for device
+// Args:
+// device - to check active ports
+// Returns:
+// number of active ports of the given device
+int get_dev_active_port_count(ibv_device* device) {
+ ibv_device_attr device_att;
+ ibv_port_attr port_attr;
+ ibv_context* context = NULL;
+ int rc, port_index, active_ports = 0;
+
+ context = ibv_open_device(device);
+ CHECK(context) << "Open context failed for " << ibv_get_device_name(device);
+ rc = ibv_query_device(context, &device_att);
+ CHECK(!rc) << "Failed to query the device";
+
+ for (port_index = 1; port_index <= device_att.phys_port_cnt; port_index++) {
+ rc = ibv_query_port(context, port_index, &port_attr);
+ CHECK(!rc) << "Failed to query the port" << port_index;
+ if (port_attr.state == IBV_PORT_ACTIVE) {
+ active_ports++;
+ }
+ }
+ ibv_close_device(context);
+ return active_ports;
+}
+
+// Function to set the device. If RDMA_DEVICE is not set, search for a device
+// with an active port.
+// Fails if more than one device with an active port is found.
+// Returns:
+// device to use
+ibv_device* set_device() {
ibv_device** dev_list;
- ibv_device* ib_dev;
- dev_list = ibv_get_device_list(NULL);
+ int dev_num, device_index, device_to_open = 0;
+ int num_devs_with_active_port = 0;
+ string env_p_rdma_device, str_port_num;
+
+ dev_list = ibv_get_device_list(&dev_num);
CHECK(dev_list) << "No InfiniBand device found";
- ib_dev = dev_list[0];
- CHECK(ib_dev) << "No InfiniBand device found";
- ibv_context* context = ibv_open_device(ib_dev);
- CHECK(context) << "Open context failed for " << ibv_get_device_name(ib_dev);
- return context;
+
+ env_p_rdma_device = get_env_var("RDMA_DEVICE");
+ if (!env_p_rdma_device.empty()) {
+ for (device_index = 0; device_index < dev_num; device_index++) {
+ if (!env_p_rdma_device.compare(
+ ibv_get_device_name(dev_list[device_index]))) {
+ CHECK(get_dev_active_port_count(dev_list[device_index]) != 0)
+ << "Device " << ibv_get_device_name(dev_list[device_index])
+ << " has no active ports";
+ return dev_list[device_index];
+ }
+ }
+    // the requested device was not found among the available devices
+    CHECK(false) << "The device " << env_p_rdma_device << " wasn't found";
+ } else {
+ // set default device
+ str_port_num = get_env_var("RDMA_DEVICE_PORT");
+ CHECK(str_port_num.empty())
+ << "RDMA_DEVICE should be provided if RDMA_DEVICE_PORT is set by user";
+ for (device_index = 0; device_index < dev_num; device_index++) {
+ // get port_num
+ if (get_dev_active_port_count(dev_list[device_index]) > 0) {
+ num_devs_with_active_port++;
+        CHECK(num_devs_with_active_port <= 1) << "More than one device with "
+                                                 "an active port was found. "
+                                                 "Please set RDMA_DEVICE";
+ // found device with at least 1 active port
+ device_to_open = device_index;
+ }
+ }
+ CHECK(num_devs_with_active_port > 0)
+ << "There is no active port in the system";
+ return dev_list[device_to_open];
+ }
+ CHECK(false) << "No device was set!";
+ return NULL; // never happens
+}
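As a usage sketch (the device name below is hypothetical; set_device() CHECK-fails if the name is unknown or the named device has no active port):

  // Sketch only: device selection is driven by the RDMA_DEVICE variable.
  setenv("RDMA_DEVICE", "mlx5_0", 1);    // hypothetical device name
  ibv_device* dev = set_device();        // returns the named device
  ibv_context* ctx = open_device(dev);   // opens its verbs context
  // With RDMA_DEVICE unset, set_device() instead scans all devices and
  // CHECK-fails unless exactly one of them has an active port.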
+
+// Function to set the port for a device.
+// If RDMA_DEVICE_PORT is not set, the first active port of the device is used.
+// Args:
+// context - context of the device
+// Returns:
+// port to use
+uint8_t set_port(ibv_context* context) {
+  uint8_t port_num = 0;  // 0 is an illegal port number
+ string str_port_num;
+ ibv_device_attr device_att;
+ ibv_port_attr port_attr;
+ int rc, port_index;
+
+ rc = ibv_query_device(context, &device_att);
+  CHECK(!rc) << "Failed to query the device";
+
+ str_port_num = get_env_var("RDMA_DEVICE_PORT");
+  // user-defined port
+ if (!str_port_num.empty()) {
+ port_num = stoi(str_port_num);
+ CHECK(port_num > 0) << "RDMA_DEVICE_PORT should be positive";
+    CHECK(port_num <= device_att.phys_port_cnt) << "RDMA_DEVICE_PORT should be "
+                                                   "less than or equal to the "
+                                                   "number of available ports";
+ rc = ibv_query_port(context, port_num, &port_attr);
+    CHECK(!rc) << "Failed to query port " << port_num;
+    // check if the port is active
+ CHECK(port_attr.state == IBV_PORT_ACTIVE)
+ << "Selected RDMA_DEVICE_PORT is not active";
+ }
+ // set default port
+ else {
+ for (port_index = 1; port_index <= device_att.phys_port_cnt; port_index++) {
+ rc = ibv_query_port(context, port_index, &port_attr);
+      CHECK(!rc) << "Failed to query port " << port_index;
+ if (port_attr.state == IBV_PORT_ACTIVE) {
+ port_num = port_index;
+ break;
+ }
+ }
+ CHECK_GT(port_num, 0) << "No active ports";
+ }
+ return port_num;
+}
+
+// Function to read from a sysfs file.
+// Args:
+// dir - directory
+// file - file
+// buf - buffer for the result
+// size - buffer size
+// Returns:
+// number of bytes read, or -1 on failure
+int read_sysfs_file(const char* dir, const char* file, char* buf, size_t size) {
+ char* path;
+ int fd;
+ int len;
+
+ if (asprintf(&path, "%s/%s", dir, file) < 0) return -1;
+
+ fd = open(path, O_RDONLY);
+ if (fd < 0) {
+ free(path);
+ return -1;
+ }
+
+ len = read(fd, buf, size);
+
+ close(fd);
+ free(path);
+
+ if (len > 0 && buf[len - 1] == '\n') buf[--len] = '\0';
+
+ return len;
+}
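A minimal sketch of how this helper is used (the device path is hypothetical; is_gid_type_roce_v2() below builds the same relative path from context->device->ibdev_path):

  char buff[41];
  int len = read_sysfs_file("/sys/class/infiniband/mlx5_0",  // hypothetical path
                            "ports/1/gid_attrs/types/0", buff, sizeof(buff));
  if (len > 0) {
    // buff now holds the GID type string, e.g. "RoCE v2" or "IB/RoCE v1",
    // with the trailing newline already stripped.
  }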
+
+// Function to check whether a GID index supports RoCE v2.
+// Args:
+// context - device context
+// port_num - port number
+// index - GID index
+// Returns:
+// true if the GID supports RoCE v2, otherwise false.
+bool is_gid_type_roce_v2(ibv_context* context, uint8_t port_num,
+ uint8_t index) {
+ char name[32];
+ char buff[41];
+
+ snprintf(name, sizeof(name), "ports/%d/gid_attrs/types/%d", port_num, index);
+ if (read_sysfs_file(context->device->ibdev_path, name, buff, sizeof(buff)) <=
+ 0) {
+ return false;
+ }
+ return !strcmp(buff, RoCE_V2);
+}
+
+// Function to set the GID index.
+// If the port link layer is InfiniBand, no GID index needs to be selected.
+// If it is Ethernet and RDMA_GID_INDEX is not set, a GID index that supports
+// RoCE v2 is chosen (fails if more than one IP is configured).
+// Args:
+// context - device context
+// port_num - port number
+// Returns:
+// GID index to use
+uint8_t set_gid(uint8_t port_num, ibv_context* context) {
+ ibv_port_attr port_attr;
+ string gid_str;
+ int rc, i, gids_num = 0, v2_ip_num = 0;
+ union ibv_gid gid;
+ uint8_t gid_index = 0;
+
+ rc = ibv_query_port(context, port_num, &port_attr);
+  CHECK(!rc) << "Failed to query port " << port_num;
+
+ for (i = 0; i < port_attr.gid_tbl_len; i++) {
+ rc = ibv_query_gid(context, port_num, i, &gid);
+ CHECK(!rc) << "Failed to query gid to port " << (int)port_num << " index "
+ << i;
+ if (gid.global.interface_id) {
+ gids_num++;
+ if (gid.global.subnet_prefix == 0 &&
+ is_gid_type_roce_v2(context, port_num, i)) {
+ if (v2_ip_num == 0) {
+ // can be overwritten by RDMA_GID_INDEX later
+ gid_index = i;
+ }
+ v2_ip_num++;
+ }
+ }
+ }
+ switch (port_attr.link_layer) {
+ case(IBV_LINK_LAYER_ETHERNET) :
+ gid_str = get_env_var("RDMA_GID_INDEX");
+ if (!gid_str.empty()) {
+ gid_index = stoi(gid_str);
+      CHECK(gid_index < gids_num)
+          << "RDMA_GID_INDEX should be less than the number of GIDs: "
+          << gids_num;
+ } else {
+ CHECK(v2_ip_num <= 1)
+          << "More than one IP is available, please specify RDMA_GID_INDEX";
+ }
+ break;
+    case(IBV_LINK_LAYER_INFINIBAND) :  // no GID index needed
+ break;
+ default:
+      LOG(INFO) << "Unknown port link layer. Currently supporting Ethernet and "
+                   "InfiniBand only.";
+ }
+ if (!is_gid_type_roce_v2(context, port_num, gid_index)) {
+ LOG(INFO) << "RoCE v2 is not configured for GID_INDEX " << (int)gid_index;
+ }
+ return gid_index;
+}
+
+// Function to set a configuration parameter from its environment variable,
+// falling back to the given default if the variable is not set.
+// Args:
+// default_val - the default value for this parameter
+// env_param - the environment variable's name
+// Returns:
+// 32-bit value to use
+uint32_t set_param(uint32_t default_val, const char* env_param) {
+ uint32_t val = default_val;
+ string val_s;
+
+ val_s = get_env_var(env_param);
+
+ if (!val_s.empty()) {
+ val = stoi(val_s);
+ }
+ return val;
+}
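Every numeric knob goes through this helper; a short sketch of the pattern:

  // If RDMA_TIMEOUT is set in the environment its value wins,
  // otherwise the compiled-in default (14) is used.
  uint32_t timeout = set_param(TIMEOUT_DEFAULT, "RDMA_TIMEOUT");
  uint32_t retry_cnt = set_param(RETRY_CNT_DEFAULT, "RDMA_RETRY_CNT");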
+
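+// Function to set the MTU.
+// If RDMA_MTU is not set, the active MTU of the port is used.
+// Args:
+// port_num - port number
+// context - device context
+// Returns:
+// MTU to use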
+enum ibv_mtu set_mtu(uint8_t port_num, ibv_context* context) {
+ ibv_port_attr port_attr;
+ enum ibv_mtu mtu;
+ string mtu_s;
+ int rc, mtu_i;
+
+ rc = ibv_query_port(context, port_num, &port_attr);
+  CHECK(!rc) << "Failed to query port " << port_num;
+
+ mtu_s = get_env_var("RDMA_MTU");
+
+ if (!mtu_s.empty()) {
+ mtu_i = stoi(mtu_s);
+ switch (mtu_i) {
+ case 256:
+ mtu = IBV_MTU_256;
+ break;
+ case 512:
+ mtu = IBV_MTU_512;
+ break;
+ case 1024:
+ mtu = IBV_MTU_1024;
+ break;
+ case 2048:
+ mtu = IBV_MTU_2048;
+ break;
+ case 4096:
+ mtu = IBV_MTU_4096;
+ break;
+ default:
+        CHECK(0) << "Error: MTU input value must be one of the following: 256, "
+                    "512, 1024, 2048, 4096. MTU " << mtu_i << " is invalid";
+ break;
+ }
+    CHECK(mtu <= port_attr.active_mtu)
+        << "MTU configuration for the QPs must not exceed the active MTU";
+ } else {
+ mtu = port_attr.active_mtu;
+ }
+ return mtu;
+}
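For example (value illustrative), RDMA_MTU accepts only the five IBV enum sizes and is validated against the port's active MTU:

  setenv("RDMA_MTU", "1024", 1);                  // illustrative value
  enum ibv_mtu mtu = set_mtu(port_num, context);  // IBV_MTU_1024, if the
                                                  // active MTU allows it
  // With RDMA_MTU unset, the port's active MTU is used unchanged.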
+
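+// Function to initialize the RDMA configuration parameters.
+// Each parameter is taken from its environment variable if set,
+// otherwise from the compiled-in default.
+// Args:
+// context - device context
+// Returns:
+// initialized RdmaParams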
+RdmaParams params_init(ibv_context* context) {
+ RdmaParams params;
+
+ params.port_num = set_port(context);
+ params.sgid_index = set_gid(params.port_num, context);
+ params.pkey_index = (uint8_t)set_param(PKEY_DEFAULT, "RDMA_PKEY");
+ params.queue_depth = set_param(QUEUE_DEPTH_DEFAULT, "RDMA_QUEUE_DEPTH");
+ params.timeout = (uint8_t)set_param(TIMEOUT_DEFAULT, "RDMA_TIMEOUT");
+ params.retry_cnt = (uint8_t)set_param(RETRY_CNT_DEFAULT, "RDMA_RETRY_CNT");
+ params.sl = (uint8_t)set_param(SL_DEFAULT, "RDMA_SL");
+ CHECK(params.sl <= 7) << "SL value is " << (int)params.sl
+ << ". Valid values are 0-7.";
+ params.mtu = set_mtu(params.port_num, context);
+ params.traffic_class = set_param(TRAFFIC_CLASS, "RDMA_TRAFFIC_CLASS");
+ return params;
}
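Taken together, the adapter can be tuned entirely from the environment before RdmaAdapter is constructed. A sketch with illustrative values (every variable is optional; the comments give the fallback):

  // Sketch: all RDMA knobs read by params_init(), with their fallbacks.
  setenv("RDMA_DEVICE", "mlx5_0", 1);     // else: sole device with active port
  setenv("RDMA_DEVICE_PORT", "1", 1);     // else: first active port
  setenv("RDMA_GID_INDEX", "3", 1);       // else: the single RoCE v2 GID
  setenv("RDMA_PKEY", "0", 1);            // else: PKEY_DEFAULT (0)
  setenv("RDMA_QUEUE_DEPTH", "1024", 1);  // else: QUEUE_DEPTH_DEFAULT (1024)
  setenv("RDMA_TIMEOUT", "14", 1);        // else: TIMEOUT_DEFAULT (14)
  setenv("RDMA_RETRY_CNT", "7", 1);       // else: RETRY_CNT_DEFAULT (7)
  setenv("RDMA_SL", "0", 1);              // else: SL_DEFAULT (0); valid 0-7
  setenv("RDMA_MTU", "4096", 1);          // else: active MTU of the port
  setenv("RDMA_TRAFFIC_CLASS", "0", 1);   // else: TRAFFIC_CLASS (0)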
ibv_pd* alloc_protection_domain(ibv_context* context) {
@@ -85,7 +409,8 @@ ibv_pd* alloc_protection_domain(ibv_context* context) {
}
RdmaAdapter::RdmaAdapter(const WorkerEnv* worker_env)
- : context_(open_default_device()),
+ : context_(open_device(set_device())),
+ params_(params_init(context_)),
pd_(alloc_protection_domain(context_)),
worker_env_(worker_env) {
event_channel_ = ibv_create_comp_channel(context_);
@@ -128,9 +453,9 @@ void RdmaAdapter::Process_CQ() {
CHECK_GE(ne, 0);
for (int i = 0; i < ne; ++i) {
CHECK(wc_[i].status == IBV_WC_SUCCESS)
- << "Failed status \n"
- << ibv_wc_status_str(wc_[i].status) << " " << wc_[i].status << " "
- << static_cast<int>(wc_[i].wr_id) << " " << wc_[i].vendor_err;
+ << "Failed status \n" << ibv_wc_status_str(wc_[i].status) << " "
+ << wc_[i].status << " " << static_cast<int>(wc_[i].wr_id) << " "
+ << wc_[i].vendor_err;
if (wc_[i].opcode == IBV_WC_RECV_RDMA_WITH_IMM) {
RdmaChannel* rc = reinterpret_cast<RdmaChannel*>(wc_[i].wr_id);
// put back a recv wr.
@@ -242,8 +567,8 @@ RdmaChannel::RdmaChannel(const RdmaAdapter* adapter, const string local_name,
memset(&attr, 0, sizeof(ibv_qp_init_attr));
attr.send_cq = adapter_->cq_;
attr.recv_cq = adapter_->cq_;
- attr.cap.max_send_wr = RdmaAdapter::MAX_CONCURRENT_WRITES;
- attr.cap.max_recv_wr = RdmaAdapter::MAX_CONCURRENT_WRITES;
+ attr.cap.max_send_wr = adapter_->params_.queue_depth;
+ attr.cap.max_recv_wr = adapter_->params_.queue_depth;
attr.cap.max_send_sge = 1;
attr.cap.max_recv_sge = 1;
attr.qp_type = IBV_QPT_RC;
@@ -257,8 +582,8 @@ RdmaChannel::RdmaChannel(const RdmaAdapter* adapter, const string local_name,
struct ibv_qp_attr attr;
memset(&attr, 0, sizeof(ibv_qp_attr));
attr.qp_state = IBV_QPS_INIT;
- attr.pkey_index = 0;
- attr.port_num = 1;
+ attr.pkey_index = adapter_->params_.pkey_index;
+ attr.port_num = adapter_->params_.port_num;
attr.qp_access_flags = IBV_ACCESS_LOCAL_WRITE | IBV_ACCESS_REMOTE_WRITE;
int mask =
@@ -269,13 +594,15 @@ RdmaChannel::RdmaChannel(const RdmaAdapter* adapter, const string local_name,
// Local address
{
struct ibv_port_attr attr;
- CHECK(!ibv_query_port(adapter_->context_, (uint8_t)1, &attr))
+ CHECK(
+ !ibv_query_port(adapter_->context_, adapter_->params_.port_num, &attr))
<< "Query port";
self_.lid = attr.lid;
self_.qpn = qp_->qp_num;
self_.psn = static_cast<uint32_t>(random::New64()) & 0xffffff;
union ibv_gid gid;
- CHECK(!ibv_query_gid(adapter_->context_, (uint8_t)1, 0, &gid))
+ CHECK(!ibv_query_gid(adapter_->context_, adapter_->params_.port_num,
+ adapter_->params_.sgid_index, &gid))
<< "Query gid";
self_.snp = gid.global.subnet_prefix;
self_.iid = gid.global.interface_id;
@@ -284,7 +611,7 @@ RdmaChannel::RdmaChannel(const RdmaAdapter* adapter, const string local_name,
// create message and ack buffers, then initialize the tables.
{
const string buffer_names[] = {"tx_message_buffer", "rx_message_buffer",
- "tx_ack_buffer", "rx_ack_buffer"};
+ "tx_ack_buffer", "rx_ack_buffer"};
tx_message_buffer_ = new RdmaMessageBuffer(this, buffer_names[0]);
rx_message_buffer_ = new RdmaMessageBuffer(this, buffer_names[1]);
tx_ack_buffer_ = new RdmaAckBuffer(this, buffer_names[2]);
@@ -345,7 +672,7 @@ void RdmaChannel::SetRemoteAddress(const RdmaAddress& ra, bool override) {
void RdmaChannel::Recv() {
struct ibv_recv_wr wr;
memset(&wr, 0, sizeof(wr));
- wr.wr_id = (uint64_t)this;
+ wr.wr_id = (uint64_t) this;
struct ibv_recv_wr* bad_wr;
CHECK(!ibv_post_recv(qp_, &wr, &bad_wr)) << "Failed to post recv";
}
@@ -479,11 +806,9 @@ void RdmaChannel::Connect(const RdmaAddress& remoteAddr) {
struct ibv_qp_attr attr;
memset(&attr, 0, sizeof(ibv_qp_attr));
attr.qp_state = IBV_QPS_RTR;
- struct ibv_port_attr port_attr;
- CHECK(!ibv_query_port(adapter_->context_, (uint8_t)1, &port_attr))
- << "Query port failed";
+
// This assumes both QP's ports are configured with the same MTU
- attr.path_mtu = port_attr.active_mtu;
+ attr.path_mtu = adapter_->params_.mtu;
attr.dest_qp_num = remoteAddr.qpn;
attr.rq_psn = remoteAddr.psn;
attr.max_dest_rd_atomic = 1;
@@ -494,30 +819,32 @@ void RdmaChannel::Connect(const RdmaAddress& remoteAddr) {
attr.ah_attr.grh.flow_label = 0;
attr.ah_attr.grh.hop_limit = 255;
attr.ah_attr.dlid = remoteAddr.lid;
- attr.ah_attr.sl = 0;
+ attr.ah_attr.sl = adapter_->params_.sl;
attr.ah_attr.src_path_bits = 0;
- attr.ah_attr.port_num = 1;
+ attr.ah_attr.port_num = adapter_->params_.port_num;
+ attr.ah_attr.grh.sgid_index = adapter_->params_.sgid_index;
+ attr.ah_attr.grh.traffic_class = adapter_->params_.traffic_class;
int r;
- CHECK(!(r = ibv_modify_qp(qp_, &attr,
- IBV_QP_STATE | IBV_QP_AV | IBV_QP_PATH_MTU |
- IBV_QP_DEST_QPN | IBV_QP_RQ_PSN |
- IBV_QP_MAX_DEST_RD_ATOMIC |
- IBV_QP_MIN_RNR_TIMER)))
+ CHECK(!(r = ibv_modify_qp(qp_, &attr, IBV_QP_STATE | IBV_QP_AV |
+ IBV_QP_PATH_MTU |
+ IBV_QP_DEST_QPN | IBV_QP_RQ_PSN |
+ IBV_QP_MAX_DEST_RD_ATOMIC |
+ IBV_QP_MIN_RNR_TIMER)))
<< "QP to Ready to Receive " << r;
memset(&attr, 0, sizeof(ibv_qp_attr));
attr.qp_state = IBV_QPS_RTS;
attr.sq_psn = self_.psn;
- attr.timeout = 14;
- attr.retry_cnt = 7;
+ attr.timeout = adapter_->params_.timeout;
+ attr.retry_cnt = adapter_->params_.retry_cnt;
attr.rnr_retry = 7; /* infinite */
attr.max_rd_atomic = 1;
- CHECK(!(r = ibv_modify_qp(qp_, &attr,
- IBV_QP_STATE | IBV_QP_TIMEOUT | IBV_QP_RETRY_CNT |
- IBV_QP_RNR_RETRY | IBV_QP_SQ_PSN |
- IBV_QP_MAX_QP_RD_ATOMIC)))
+ CHECK(!(r = ibv_modify_qp(qp_, &attr, IBV_QP_STATE | IBV_QP_TIMEOUT |
+ IBV_QP_RETRY_CNT |
+ IBV_QP_RNR_RETRY | IBV_QP_SQ_PSN |
+ IBV_QP_MAX_QP_RD_ATOMIC)))
<< "QP to Ready to Send " << r;
connected_ = true;
@@ -604,7 +931,7 @@ void RdmaBuffer::Write(uint32_t imm_data, size_t buffer_size) {
struct ibv_send_wr wr;
memset(&wr, 0, sizeof(wr));
- wr.wr_id = (uint64_t)this;
+ wr.wr_id = (uint64_t) this;
wr.sg_list = &list;
wr.num_sge = 1;
wr.opcode = IBV_WR_RDMA_WRITE_WITH_IMM;
@@ -699,9 +1026,9 @@ Rendezvous::DoneCallback RdmaTensorBuffer::getRecvTensorCallback(
TensorProto proto;
if (src_dev->tensorflow_gpu_device_info() &&
(!send_args.alloc_attrs.on_host())) {
- CHECK(send_args.device_context)
- << "send dev name: " << src_dev->name()
- << " gpu_info: " << src_dev->tensorflow_gpu_device_info();
+ CHECK(send_args.device_context) << "send dev name: " << src_dev->name()
+ << " gpu_info: "
+ << src_dev->tensorflow_gpu_device_info();
if (can_memcpy) {
AllocatorAttributes host_alloc_attrs;
@@ -727,8 +1054,8 @@ Rendezvous::DoneCallback RdmaTensorBuffer::getRecvTensorCallback(
// async instead
GPUUtil::SetProtoFromGPU(
in, src_dev, send_args.device_context, &proto, is_dead,
- [this, proto, buffer_size, key, in, step_id, key_with_step_id,
- is_dead, send_args, recv_args](const Status& s) mutable {
+ [this, proto, buffer_size, key, in, step_id, key_with_step_id,
+ is_dead, send_args, recv_args](const Status& s) mutable {
CHECK(s.ok()) << "copy proto from gpu sync";
auto tensor_bytes = proto.ByteSize();
buffer_size += tensor_bytes;
diff --git a/tensorflow/contrib/verbs/rdma.h b/tensorflow/contrib/verbs/rdma.h
index e1e07db776..52d92a7c5b 100644
--- a/tensorflow/contrib/verbs/rdma.h
+++ b/tensorflow/contrib/verbs/rdma.h
@@ -36,7 +36,24 @@ limitations under the License.
#include "tensorflow/core/platform/mutex.h"
namespace tensorflow {
-
+#define PKEY_DEFAULT 0
+#define QUEUE_DEPTH_DEFAULT 1024
+#define TIMEOUT_DEFAULT 14
+#define RETRY_CNT_DEFAULT 7
+#define SL_DEFAULT 0
+#define TRAFFIC_CLASS 0
+
+struct RdmaParams {
+ uint8_t port_num;
+ uint8_t sgid_index;
+ uint8_t pkey_index;
+ uint32_t queue_depth;
+ uint8_t timeout;
+ uint8_t retry_cnt;
+ uint8_t sl;
+ enum ibv_mtu mtu;
+ uint8_t traffic_class;
+};
// structure to save the address of remote channels.
struct RdmaAddress {
uint32_t lid;
@@ -50,9 +67,20 @@ struct RemoteMR {
uint64_t remote_addr;
uint32_t rkey;
};
-enum BufferStatus { none, idle, busy };
-enum Location { local, remote };
-enum BufferType { ACK, MESSAGE, TENSOR };
+enum BufferStatus {
+ none,
+ idle,
+ busy
+};
+enum Location {
+ local,
+ remote
+};
+enum BufferType {
+ ACK,
+ MESSAGE,
+ TENSOR
+};
enum RdmaMessageType {
RDMA_MESSAGE_ACK,
RDMA_MESSAGE_BUFFER_IDLE,
@@ -84,6 +112,8 @@ class RdmaAdapter {
protected:
static const int MAX_CONCURRENT_WRITES = 1000;
ibv_context* context_;
+ // RDMA configuration parameters
+ RdmaParams params_;
// ibverbs protection domain
ibv_pd* pd_;
// Completion event channel, to wait for work completions
@@ -183,7 +213,7 @@ class RdmaBuffer {
}
void FreeBuffer();
void EnqueueItem(string Item);
- virtual void SendNextItem(){};
+ virtual void SendNextItem() {};
void CreateCPUBuffer(size_t size, bool lock = true);
void SetRemoteMR(RemoteMR rmi, bool override);
uint32_t LookupBufferIndex(const string& buffer_name) {
diff --git a/tensorflow/core/BUILD b/tensorflow/core/BUILD
index 18d69fceb3..9530af637e 100644
--- a/tensorflow/core/BUILD
+++ b/tensorflow/core/BUILD
@@ -2710,6 +2710,7 @@ tf_cc_test_mkl(
srcs = [
"graph/mkl_layout_pass_test.cc",
"graph/mkl_tfconversion_pass_test.cc",
+ "util/mkl_util_test.cc",
],
linkstatic = 1,
deps = [
diff --git a/tensorflow/core/common_runtime/bfc_allocator.cc b/tensorflow/core/common_runtime/bfc_allocator.cc
index 6399b8cf55..38fe247521 100644
--- a/tensorflow/core/common_runtime/bfc_allocator.cc
+++ b/tensorflow/core/common_runtime/bfc_allocator.cc
@@ -296,13 +296,12 @@ void* BFCAllocator::FindChunkPtr(BinNum bin_num, size_t rounded_bytes,
// it from the free bin structure prior to using.
RemoveFreeChunkIterFromBin(&b->free_chunks, citer);
- // If we can break the size of the chunk into two reasonably large
- // pieces, do so. In any case don't waste more than
- // kMaxInternalFragmentation bytes on padding this alloc.
- const int64 kMaxInternalFragmentation = 128 << 20; // 128mb
- if (chunk->size >= rounded_bytes * 2 ||
- static_cast<int64>(chunk->size) - rounded_bytes >=
- kMaxInternalFragmentation) {
+ // If we can break the size of the chunk into two reasonably
+ // large pieces, do so.
+ //
+ // TODO(vrv): What should be the criteria when deciding when
+ // to split?
+ if (chunk->size >= rounded_bytes * 2) {
SplitChunk(h, rounded_bytes);
chunk = ChunkFromHandle(h); // Update chunk pointer in case it moved
}
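For intuition, a worked example of the retained split criterion (sizes illustrative):

  // chunk->size = 1 MiB, rounded_bytes = 256 KiB:
  //   1 MiB >= 2 * 256 KiB  -> split; the 768 KiB remainder is re-binned.
  // chunk->size = 1 MiB, rounded_bytes = 600 KiB:
  //   1 MiB <  2 * 600 KiB  -> no split; 424 KiB of internal fragmentation.
  // The reverted variant additionally split whenever that padding exceeded
  // the 128 MiB kMaxInternalFragmentation bound.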
diff --git a/tensorflow/core/common_runtime/mkl_cpu_allocator.h b/tensorflow/core/common_runtime/mkl_cpu_allocator.h
index 53e80b1ee3..63b74e8dbf 100644
--- a/tensorflow/core/common_runtime/mkl_cpu_allocator.h
+++ b/tensorflow/core/common_runtime/mkl_cpu_allocator.h
@@ -81,7 +81,7 @@ class MklCPUAllocator : public Allocator {
}
#if defined(_SC_PHYS_PAGES) && defined(_SC_PAGESIZE)
if (user_val > max_mem_bytes) {
- LOG(WARNING) << "The user specifed a memory limit " << kMaxLimitStr
+ LOG(WARNING) << "The user specified a memory limit " << kMaxLimitStr
<< "=" << user_val
<< " greater than available physical memory: "
<< max_mem_bytes
diff --git a/tensorflow/core/common_runtime/sycl/sycl_device.h b/tensorflow/core/common_runtime/sycl/sycl_device.h
index 9caa076c72..cc272d156e 100644
--- a/tensorflow/core/common_runtime/sycl/sycl_device.h
+++ b/tensorflow/core/common_runtime/sycl/sycl_device.h
@@ -46,8 +46,8 @@ class GSYCLInterface {
if (!found_device) {
// Currently Intel GPU is not supported
- LOG(WARNING) << "No OpenCL GPU found that is supported by ComputeCpp, "
- "trying OpenCL CPU";
+ LOG(WARNING) << "No OpenCL GPU found that is supported by "
+ << "ComputeCpp/triSYCL, trying OpenCL CPU";
}
for (const auto& device : device_list) {
@@ -59,9 +59,23 @@ class GSYCLInterface {
}
if (!found_device) {
+ LOG(WARNING) << "No OpenCL CPU found that is supported by "
+ << "ComputeCpp/triSYCL, checking for host sycl device";
+ }
+
+ for (const auto& device : device_list) {
+ // triSYCL only supports the host device for now
+ if (device.is_host()) {
+ LOG(WARNING) << "Found SYCL host device";
+ AddDevice(device);
+ found_device = true;
+ }
+ }
+
+ if (!found_device) {
// Currently Intel GPU is not supported
- LOG(FATAL)
- << "No OpenCL GPU nor CPU found that is supported by ComputeCpp";
+ LOG(FATAL) << "No SYCL host and no OpenCL GPU nor CPU"
+ << " supported by ComputeCPP/triSYCL was found";
} else {
LOG(INFO) << "Found following OpenCL devices:";
for (int i = 0; i < device_list.size(); i++) {
diff --git a/tensorflow/core/framework/bfloat16.cc b/tensorflow/core/framework/bfloat16.cc
index 1a6f355c77..a5ac0e1a8d 100644
--- a/tensorflow/core/framework/bfloat16.cc
+++ b/tensorflow/core/framework/bfloat16.cc
@@ -18,24 +18,32 @@ limitations under the License.
namespace tensorflow {
void FloatToBFloat16(const float* src, bfloat16* dst, int64 size) {
- for (int64 i = 0; i < size; ++i) {
- dst[i] = bfloat16(src[i]);
- }
+ const uint16_t* p = reinterpret_cast<const uint16_t*>(src);
+ uint16_t* q = reinterpret_cast<uint16_t*>(dst);
+#if __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__
+ for (; size != 0; p += 2, q++, size--) {
+ *q = p[0];
+ }
+#else
+ for (; size != 0; p += 2, q++, size--) {
+ *q = p[1];
+ }
+#endif
}
void BFloat16ToFloat(const bfloat16* src, float* dst, int64 size) {
const uint16_t* p = reinterpret_cast<const uint16_t*>(src);
uint16_t* q = reinterpret_cast<uint16_t*>(dst);
#if __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__
- for (; size != 0; p++, q += 2, size--) {
- q[0] = *p;
- q[1] = 0;
+ for (; size != 0; p++, q += 2, size--) {
+ q[0] = *p;
+ q[1] = 0;
}
-#else
- for (; size != 0; p++, q += 2, size--) {
- q[0] = 0;
- q[1] = *p;
- }
+#else
+ for (; size != 0; p++, q += 2, size--) {
+ q[0] = 0;
+ q[1] = *p;
+ }
#endif
}
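With the rounding path reverted, the conversion is pure truncation of the low 16 bits. A standalone sketch of the little-endian behavior (the helper name is hypothetical; only memcpy and a shift are involved):

  #include <cstdint>
  #include <cstring>

  uint16_t truncate_to_bfloat16(float f) {
    uint32_t bits;
    std::memcpy(&bits, &f, sizeof(bits));
    // Keep sign, exponent, and the top 7 fraction bits; drop the rest.
    return static_cast<uint16_t>(bits >> 16);
  }
  // truncate_to_bfloat16(1.0f)       == 0x3F80  (exact)
  // truncate_to_bfloat16(3.9960938f) == 0x407F  (~3.984), whereas the removed
  // round-to-nearest-even path produced 0x4080 (4.0) for the same input.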
diff --git a/tensorflow/core/framework/bfloat16_test.cc b/tensorflow/core/framework/bfloat16_test.cc
index a25b764ea2..af4e6a4411 100644
--- a/tensorflow/core/framework/bfloat16_test.cc
+++ b/tensorflow/core/framework/bfloat16_test.cc
@@ -15,7 +15,6 @@ limitations under the License.
#include "tensorflow/core/framework/bfloat16.h"
-#include "tensorflow/core/lib/core/casts.h"
#include "tensorflow/core/platform/test.h"
#include "tensorflow/core/platform/test_benchmark.h"
@@ -28,97 +27,6 @@ TEST(Bfloat16Test, Simple) {
EXPECT_EQ(0x4140, a.value);
}
-float BinaryToFloat(uint32_t sign, uint32_t exponent, uint32_t high_mantissa,
- uint32_t low_mantissa) {
- return bit_cast<float>((sign << 31) + (exponent << 23) +
- (high_mantissa << 16) + low_mantissa);
-}
-
-struct Bfloat16TestParam {
- float input;
- float expected;
-};
-
-class Bfloat16Test : public ::testing::Test,
- public ::testing::WithParamInterface<Bfloat16TestParam> {};
-
-TEST_P(Bfloat16Test, RoundOrTruncate) {
- bfloat16 a(GetParam().input);
- if (std::isnan(GetParam().input)) {
- EXPECT_TRUE(std::isnan(float(a)));
- return;
- }
- EXPECT_EQ(GetParam().expected, float(a));
-}
-
-INSTANTIATE_TEST_CASE_P(
- Bfloat16Test_Instantiation, Bfloat16Test,
- ::testing::Values(
- // More than half.
- Bfloat16TestParam{
- BinaryToFloat(0, 0b10000000, 0b1001000, 0b1111010111000011),
- BinaryToFloat(0, 0b10000000, 0b1001001, 0b0000000000000000)},
-
- Bfloat16TestParam{
- BinaryToFloat(1, 0b10000000, 0b1001000, 0b1111010111000011),
- BinaryToFloat(1, 0b10000000, 0b1001001, 0b0000000000000000)},
-
- // Exact half.
- Bfloat16TestParam{
- BinaryToFloat(0, 0b10000000, 0b1001000, 0b1000000000000000),
- BinaryToFloat(0, 0b10000000, 0b1001000, 0b0000000000000000)},
-
- // NaN stays at NaN.
- Bfloat16TestParam{
- BinaryToFloat(0, 0b11111111, 0b0000000, 0b0000000000000001),
- BinaryToFloat(0, 0b11111111, 0b1000000, 0b0000000000000000)},
-
- // NaN stays at NaN -- no exponents overflow.
- Bfloat16TestParam{
- BinaryToFloat(0, 0b11111111, 0b1111111, 0b1111111111111111),
- BinaryToFloat(0, 0b11111111, 0b1000000, 0b0000000000000000)},
-
- // More than half, round to an odd number.
- Bfloat16TestParam{
- BinaryToFloat(1, 0b10000000, 0b1001000, 0b1100000000000000),
- BinaryToFloat(1, 0b10000000, 0b1001001, 0b0000000000000000)},
-
- // Less than half, truncate.
- Bfloat16TestParam{
- BinaryToFloat(0, 0b10000000, 0b1001000, 0b0000000000000000),
- BinaryToFloat(0, 0b10000000, 0b1001000, 0b0000000000000000)},
-
- // Less than half, truncate.
- Bfloat16TestParam{
- BinaryToFloat(0, 0b10000000, 0b1001000, 0b0100000000000000),
- BinaryToFloat(0, 0b10000000, 0b1001000, 0b0000000000000000)},
-
- // Exact at half, but result is already even.
- Bfloat16TestParam{
- BinaryToFloat(0, 0b10000000, 0b1001000, 0b1000000000000000),
- BinaryToFloat(0, 0b10000000, 0b1001000, 0b0000000000000000)},
-
- // Denormal values.
- Bfloat16TestParam{
- BinaryToFloat(0, 0b00000000, 0b1001000, 0b1000000000000000),
- BinaryToFloat(0, 0b00000000, 0b1001000, 0b0000000000000000)},
- Bfloat16TestParam{
- BinaryToFloat(0, 0b00000000, 0b1111111, 0b1100000000000000),
- BinaryToFloat(0, 0b00000001, 0b0000000, 0b0000000000000000)}));
-TEST(Bfloat16Test, RoundWithFractionOverflow) {
- // Still works with fraction overflow -- round to 4./
- //
- // Input 3.9960938:
- // Sign | Exp (8 bit) | Frac (first 7 bit) | Frac (last 16 bit)
- // 0 1 0 0 0 0 0 0 1 1 1 1 1 1 1 1100000000000000
- //
- // Should round to 4.0:
- // Sign | Exp (8 bit) | Frac (first 7 bit)
- // 0 1 0 0 0 0 0 1 0 0 0 0 0 0 0
- bfloat16 a(3.9960938f);
- EXPECT_EQ(4.0, float(a));
-}
-
TEST(Bfloat16Test, Conversion) {
float a[100];
for (int i = 0; i < 100; ++i) {
diff --git a/tensorflow/core/framework/numeric_types.h b/tensorflow/core/framework/numeric_types.h
index d005de2af1..a630bee38d 100644
--- a/tensorflow/core/framework/numeric_types.h
+++ b/tensorflow/core/framework/numeric_types.h
@@ -44,262 +44,29 @@ typedef Eigen::QUInt16 quint16;
// see framework/bfloat16.h for description.
struct bfloat16 {
EIGEN_DEVICE_FUNC bfloat16() {}
-
- explicit EIGEN_DEVICE_FUNC bfloat16(float v) {
- uint32_t input;
- memcpy(&input, &v, sizeof(uint32_t));
-
- if ((~input & 0x7f800000) == 0 && (input & 0x007fffff) != 0) {
- // If the value is a NaN, squash it to a qNaN with msb of fraction set,
- // this makes sure after truncation we don't end up with an inf.
- //
- // qNaN magic: All exponent bits set + most significant bit of fraction
- // set.
- value = 0x7fc0;
- } else {
- // Fast rounding algorithm that rounds a half value to nearest even. This
- // reduces expected error when we convert a large number of floats. Here
- // is how it works:
- //
- // Definitions:
- // To convert a float 32 to bfloat16, a float 32 can be viewed as 32 bits
- // with the following tags:
- //
- // Sign | Exp (8 bits) | Frac (23 bits)
- // S EEEEEEEE FFFFFFLRTTTTTTTTTTTTTTT
- //
- // S: Sign bit.
- // E: Exponent bits.
- // F: First 6 bits of fraction.
- // L: Least significant bit of resulting bfloat16 if we truncate away the
- // rest of the float32. This is also the 7th bit of fraction
- // R: Rounding bit, 8th bit of fraction.
- // T: Sticky bits, rest of fraction, 15 bits.
- //
- // To round half to nearest even, there are 3 cases where we want to round
- // down (simply truncate the result of the bits away, which consists of
- // rounding bit and sticky bits) and two cases where we want to round up
- // (truncate then add one to the result).
- //
- // The fast converting algorithm simply adds lsb (L) to 0x7fff (15 bits of
- // 1s) as the rounding bias, adds the rounding bias to the input, then
- // truncates the last 16 bits away.
- //
- // To understand how it works, we can analyze this algorithm case by case:
- //
- // 1. L = 0, R = 0:
- // Expect: round down, this is less than half value.
- //
- // Algorithm:
- // - Rounding bias: 0x7fff + 0 = 0x7fff
- // - Adding rounding bias to input may create any carry, depending on
- // whether there is any value set to 1 in T bits.
- // - R may be set to 1 if there is a carry.
- // - L remains 0.
- // - Note that this case also handles Inf and -Inf, where all fraction
- // bits, including L, R and Ts are all 0. The output remains Inf after
- // this algorithm.
- //
- // 2. L = 1, R = 0:
- // Expect: round down, this is less than half value.
- //
- // Algorithm:
- // - Rounding bias: 0x7fff + 1 = 0x8000
- // - Adding rounding bias to input doesn't change sticky bits but
- // adds 1 to rounding bit.
- // - L remains 1.
- //
- // 3. L = 0, R = 1, all of T are 0:
- // Expect: round down, this is exactly at half, the result is already
- // even (L=0).
- //
- // Algorithm:
- // - Rounding bias: 0x7fff + 0 = 0x7fff
- // - Adding rounding bias to input sets all sticky bits to 1, but
- // doesn't create a carry.
- // - R remains 1.
- // - L remains 0.
- //
- // 4. L = 1, R = 1:
- // Expect: round up, this is exactly at half, the result needs to be
- // round to the next even number.
- //
- // Algorithm:
- // - Rounding bias: 0x7fff + 1 = 0x8000
- // - Adding rounding bias to input doesn't change sticky bits, but
- // creates a carry from rounding bit.
- // - The carry sets L to 0, creates another carry bit and propagate
- // forward to F bits.
- // - If all the F bits are 1, a carry then propagates to the exponent
- // bits, which then creates the minimum value with the next exponent
- // value. Note that we won't have the case where exponents are all 1,
- // since that's either a NaN (handled in the other if condition) or inf
- // (handled in case 1).
- //
- // 5. L = 0, R = 1, any of T is 1:
- // Expect: round up, this is greater than half.
- //
- // Algorithm:
- // - Rounding bias: 0x7fff + 0 = 0x7fff
- // - Adding rounding bias to input creates a carry from sticky bits,
- // sets rounding bit to 0, then create another carry.
- // - The second carry sets L to 1.
- //
- // Examples:
- //
- // Exact half value that is already even:
- // Input:
- // Sign | Exp (8 bit) | Frac (first 7 bit) | Frac (last 16 bit)
- // S E E E E E E E E F F F F F F L RTTTTTTTTTTTTTTT
- // 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 1000000000000000
- //
- // This falls into case 3. We truncate the rest of 16 bits and no
- // carry is created into F and L:
- //
- // Output:
- // Sign | Exp (8 bit) | Frac (first 7 bit)
- // S E E E E E E E E F F F F F F L
- // 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0
- //
- // Exact half value, round to next even number:
- // Input:
- // Sign | Exp (8 bit) | Frac (first 7 bit) | Frac (last 16 bit)
- // S E E E E E E E E F F F F F F L RTTTTTTTTTTTTTTT
- // 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1000000000000000
- //
- // This falls into case 4. We create a carry from R and T,
- // which then propagates into L and F:
- //
- // Output:
- // Sign | Exp (8 bit) | Frac (first 7 bit)
- // S E E E E E E E E F F F F F F L
- // 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0
- //
- //
- // Max denormal value round to min normal value:
- // Input:
- // Sign | Exp (8 bit) | Frac (first 7 bit) | Frac (last 16 bit)
- // S E E E E E E E E F F F F F F L RTTTTTTTTTTTTTTT
- // 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1111111111111111
- //
- // This falls into case 4. We create a carry from R and T,
- // propagate into L and F, which then propagates into exponent
- // bits:
- //
- // Output:
- // Sign | Exp (8 bit) | Frac (first 7 bit)
- // S E E E E E E E E F F F F F F L
- // 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0
- //
- // Max normal value round to Inf:
- // Input:
- // Sign | Exp (8 bit) | Frac (first 7 bit) | Frac (last 16 bit)
- // S E E E E E E E E F F F F F F L RTTTTTTTTTTTTTTT
- // 0 1 1 1 1 1 1 1 0 1 1 1 1 1 1 1 1111111111111111
- //
- // This falls into case 4. We create a carry from R and T,
- // propagate into L and F, which then propagates into exponent
- // bits:
- //
- // Sign | Exp (8 bit) | Frac (first 7 bit)
- // S E E E E E E E E F F F F F F L
- // 0 1 1 1 1 1 1 1 1 0 0 0 0 0 0 0
- //
- //
- // Least significant bit of resulting bfloat.
- uint32_t lsb = (input >> 16) & 1;
- uint32_t rounding_bias = 0x7fff + lsb;
- input += rounding_bias;
- value = static_cast<uint16_t>(input >> 16);
- }
- }
-
- template <class T>
- explicit EIGEN_DEVICE_FUNC bfloat16(const T& val)
- : bfloat16(static_cast<float>(val)) {}
-
- EIGEN_DEVICE_FUNC EIGEN_EXPLICIT_CAST(float) const {
- float result;
-
- uint16_t* q = reinterpret_cast<uint16_t*>(&result);
-
+ EIGEN_DEVICE_FUNC explicit bfloat16(const float v) {
+ const uint16_t* p = reinterpret_cast<const uint16_t*>(&v);
#if __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__
- q[0] = value;
- q[1] = 0;
+ value = p[0];
#else
- q[0] = 0;
- q[1] = value;
+ value = p[1];
#endif
- return result;
- }
-
- EIGEN_DEVICE_FUNC explicit operator bool() const {
- return static_cast<bool>(float(*this));
- }
-
- EIGEN_DEVICE_FUNC explicit operator Eigen::half() const {
- return static_cast<Eigen::half>(float(*this));
- }
-
- EIGEN_DEVICE_FUNC explicit operator short() const {
- return static_cast<short>(float(*this));
- }
-
- EIGEN_DEVICE_FUNC explicit operator int() const {
- return static_cast<int>(float(*this));
- }
-
- EIGEN_DEVICE_FUNC explicit operator char() const {
- return static_cast<char>(float(*this));
- }
-
- EIGEN_DEVICE_FUNC explicit operator signed char() const {
- return static_cast<signed char>(float(*this));
- }
-
- EIGEN_DEVICE_FUNC explicit operator unsigned char() const {
- return static_cast<unsigned char>(float(*this));
- }
-
- EIGEN_DEVICE_FUNC explicit operator unsigned int() const {
- return static_cast<unsigned int>(float(*this));
- }
-
- EIGEN_DEVICE_FUNC explicit operator unsigned long() const {
- return static_cast<unsigned long>(float(*this));
- }
-
- EIGEN_DEVICE_FUNC explicit operator unsigned long long() const {
- return static_cast<unsigned long long>(float(*this));
- }
-
- EIGEN_DEVICE_FUNC explicit operator long long() const {
- return static_cast<long long>(float(*this));
- }
-
- EIGEN_DEVICE_FUNC explicit operator double() const {
- return static_cast<double>(float(*this));
}
uint16_t value;
};
-inline bool operator==(const bfloat16 a, const bfloat16 b) {
- return a.value == b.value;
-}
-
-inline bool operator!=(const bfloat16 a, const bfloat16 b) {
- return a.value != b.value;
-}
-
} // end namespace tensorflow
namespace Eigen {
template <>
struct NumTraits<tensorflow::bfloat16> : GenericNumTraits<uint16_t> {};
-using ::tensorflow::operator==;
-using ::tensorflow::operator!=;
+EIGEN_STRONG_INLINE bool operator==(const tensorflow::bfloat16 a,
+ const tensorflow::bfloat16 b) {
+ return a.value == b.value;
+}
+
} // namespace Eigen
#ifdef COMPILER_MSVC
diff --git a/tensorflow/core/framework/register_types.h b/tensorflow/core/framework/register_types.h
index 4bb37e4f6e..c31ab18cc1 100644
--- a/tensorflow/core/framework/register_types.h
+++ b/tensorflow/core/framework/register_types.h
@@ -87,8 +87,7 @@ limitations under the License.
#elif defined(__ANDROID_TYPES_FULL__)
-// Only string, half, float, int32, int64, bool, and quantized types
-// supported.
+// Only half, float, int32, int64, bool, and quantized types are supported.
#define TF_CALL_float(m) m(float)
#define TF_CALL_double(m)
#define TF_CALL_int32(m) m(::tensorflow::int32)
@@ -97,7 +96,7 @@ limitations under the License.
#define TF_CALL_int16(m)
#define TF_CALL_int8(m)
-#define TF_CALL_string(m) m(string)
+#define TF_CALL_string(m)
#define TF_CALL_resource(m)
#define TF_CALL_variant(m)
#define TF_CALL_complex64(m)
diff --git a/tensorflow/core/graph/graph.cc b/tensorflow/core/graph/graph.cc
index 87c41186d5..fd1b5d33b9 100644
--- a/tensorflow/core/graph/graph.cc
+++ b/tensorflow/core/graph/graph.cc
@@ -453,6 +453,21 @@ const Edge* Graph::AddControlEdge(Node* source, Node* dest,
return AddEdge(source, kControlSlot, dest, kControlSlot);
}
+void Graph::RemoveControlEdge(const Edge* e) {
+ if (!e->src_->IsSource() && !e->dst_->IsSink()) {
+ e->dst_->MaybeCopyOnWrite();
+ std::string e_src_name = strings::StrCat("^", e->src_->name());
+ auto* inputs = e->dst_->props_->node_def.mutable_input();
+ for (auto it = inputs->begin(); it != inputs->end(); ++it) {
+ if (*it == e_src_name) {
+ inputs->erase(it);
+ break;
+ }
+ }
+ }
+ RemoveEdge(e);
+}
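A minimal usage sketch (assuming `g` is a tensorflow::Graph and `a`, `b` are non-source/sink nodes in it):

  const Edge* ctrl = g.AddControlEdge(a, b);  // also records "^a" in b's NodeDef
  g.RemoveControlEdge(ctrl);                  // erases "^a" again, then the edge
  // A plain g.RemoveEdge(ctrl) would leave the stale "^a" input in the NodeDef.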
+
Status Graph::UpdateEdge(Node* new_src, int new_src_index, Node* dst,
int dst_index) {
TF_RETURN_IF_ERROR(IsValidOutputTensor(new_src, new_src_index));
diff --git a/tensorflow/core/graph/graph.h b/tensorflow/core/graph/graph.h
index c5dde722fa..d0dba6e1f0 100644
--- a/tensorflow/core/graph/graph.h
+++ b/tensorflow/core/graph/graph.h
@@ -451,6 +451,11 @@ class Graph {
// REQUIRES: The edge must exist.
void RemoveEdge(const Edge* edge);
+  // Removes control edge `e` from the graph. Note that this also updates
+ // the corresponding NodeDef to reflect the change.
+ // REQUIRES: The control edge must exist.
+ void RemoveControlEdge(const Edge* e);
+
// Updates the input to a node. The existing edge to `dst` is removed and an
// edge from `new_src` to `dst` is created. The NodeDef associated with `dst`
// is also updated.
diff --git a/tensorflow/core/graph/graph_constructor.cc b/tensorflow/core/graph/graph_constructor.cc
index 2ee409768b..753cb260e5 100644
--- a/tensorflow/core/graph/graph_constructor.cc
+++ b/tensorflow/core/graph/graph_constructor.cc
@@ -68,8 +68,7 @@ class GraphConstructor {
Options(const GraphConstructorOptions& in) // NOLINT(runtime/explicit)
: allow_internal_ops(in.allow_internal_ops),
expect_device_spec(in.expect_device_spec),
- importing(false),
- validate_colocation_constraints(false) {}
+ importing(false) {}
Options(const ImportGraphDefOptions& in) // NOLINT(runtime/explicit)
: allow_internal_ops(false),
expect_device_spec(false),
@@ -82,8 +81,7 @@ class GraphConstructor {
control_dependencies(in.control_dependencies),
return_tensors(in.return_tensors),
return_nodes(in.return_nodes),
- importing(true),
- validate_colocation_constraints(in.validate_colocation_constraints) {}
+ importing(true) {}
bool allow_internal_ops;
bool expect_device_spec;
@@ -105,7 +103,6 @@ class GraphConstructor {
// applicable to ConvertGraphDefToGraph as well, so make an attempt to
// remove this.
bool importing;
- bool validate_colocation_constraints;
};
typedef gtl::ArraySlice<const NodeDef*> NodeDefSlice;
@@ -495,8 +492,7 @@ Status GraphConstructor::InitFromEdges() {
Status GraphConstructor::ValidateColocationConstraints(
const NodeDef& node_def) {
- if (!opts_.validate_colocation_constraints || !opts_.importing)
- return Status::OK();
+ if (!opts_.importing) return Status::OK();
const auto iter = node_def.attr().find(kColocationAttrName);
if (iter == node_def.attr().end()) return Status::OK();
for (const string& c : iter->second.list().s()) {
diff --git a/tensorflow/core/graph/graph_constructor.h b/tensorflow/core/graph/graph_constructor.h
index 4b418b8622..416c0ee9ae 100644
--- a/tensorflow/core/graph/graph_constructor.h
+++ b/tensorflow/core/graph/graph_constructor.h
@@ -119,9 +119,6 @@ struct ImportGraphDefOptions {
// TODO(skyewm): make this work with `skip_mapped_nodes` if there's a need.
std::vector<string> return_nodes;
- // If true, checks that all colocation constraints are nodes in the GraphDef.
- bool validate_colocation_constraints = true;
-
// TODO(ashankar): Enable handling of GraphDefs produced by newer binaries
// with ops that are not defined in the binary calling ImportGraphDef.
// Similar to the producer_op_list argument to import_graph_def in the
diff --git a/tensorflow/core/graph/graph_constructor_test.cc b/tensorflow/core/graph/graph_constructor_test.cc
index 893826da3e..cd541c7d86 100644
--- a/tensorflow/core/graph/graph_constructor_test.cc
+++ b/tensorflow/core/graph/graph_constructor_test.cc
@@ -2978,20 +2978,5 @@ versions {
EXPECT_EQ(17, refiner.graph_def_version());
}
-TEST_F(GraphConstructorTest, ImportGraphDef_ValidateColationConstraints) {
- GraphDef def;
- ASSERT_TRUE(protobuf::TextFormat::ParseFromString(
- "node { name: 'A' op: 'TestInput' attr { key: '_class' value { list { "
- "s:'loc:@missing' } } } }",
- &def));
- ImportGraphDefOptions options;
- // TODO(yaozhang): Extend ExpectError to check error type and use ExpectError
- // and ExpectOK to replace the code below.
- Status s = ImportGraphDef(options, def, &graph_, nullptr);
- EXPECT_TRUE(errors::IsInvalidArgument(s)) << s;
- options.validate_colocation_constraints = false;
- TF_EXPECT_OK(ImportGraphDef(options, def, &graph_, nullptr));
-}
-
} // namespace
} // namespace tensorflow
diff --git a/tensorflow/core/graph/graph_partition.cc b/tensorflow/core/graph/graph_partition.cc
index b9e3cba035..1924c05d3d 100644
--- a/tensorflow/core/graph/graph_partition.cc
+++ b/tensorflow/core/graph/graph_partition.cc
@@ -117,7 +117,7 @@ DataType EdgeType(const Edge* e) {
}
}
-// Return true iff we need to add a same device send/recv for 'edge'.
+// Return true iff we need to add the same device send/recv for 'edge'.
bool NeedSameDeviceSendRecv(const Edge* edge, const GraphInfo& info) {
if (edge->IsControlEdge()) {
return false;
@@ -1116,7 +1116,7 @@ Status Partition(const PartitionOptions& opts, Graph* g,
// before the data is available.
AddInput(real_recv, send->name(), Graph::kControlSlot);
} else if (control_flow_edge != nullptr) {
- // Redirect control edge to the real recv since this is not a same
+ // Redirect control edge to the real recv since this is not the same
// device send/recv.
--num_control_flow_edges;
AddInput(real_recv, control_flow_edge->src()->name(),
diff --git a/tensorflow/core/graph/graph_test.cc b/tensorflow/core/graph/graph_test.cc
index e5d57facaa..d1c89a48bd 100644
--- a/tensorflow/core/graph/graph_test.cc
+++ b/tensorflow/core/graph/graph_test.cc
@@ -118,6 +118,25 @@ class GraphTest : public ::testing::Test {
LOG(FATAL) << name;
}
+ bool ControlEdgeExistsInGraphOrNodeDef(const Node* src,
+ const Node* dst) {
+ for (const Edge *e : dst->in_edges()) {
+ if (e->IsControlEdge() &&
+ e->src() == src &&
+ e->src_output() == Graph::kControlSlot &&
+ e->dst_input() == Graph::kControlSlot) {
+ return true;
+ }
+ }
+ std::string control_edge_name = strings::StrCat("^", src->name());
+ for (int i = 0; i < dst->def().input_size(); ++i) {
+ if (dst->def().input(i) == control_edge_name) {
+ return true;
+ }
+ }
+ return false;
+ }
+
Graph graph_;
private:
@@ -458,8 +477,8 @@ TEST_F(GraphTest, AddControlEdge) {
EXPECT_TRUE(edge == nullptr);
EXPECT_EQ(b->def().input_size(), 2);
- // Can add redundant control edge with create_duplicate.
- edge = graph_.AddControlEdge(a, b, /*create_duplicate=*/true);
+ // Can add redundant control edge with allow_duplicates.
+ edge = graph_.AddControlEdge(a, b, /*allow_duplicates=*/true);
EXPECT_TRUE(edge != nullptr);
// allow_duplicates causes the NodeDef not to be updated.
ASSERT_EQ(b->def().input_size(), 2);
@@ -477,6 +496,47 @@ TEST_F(GraphTest, AddControlEdge) {
EXPECT_EQ(b->def().input_size(), 2);
}
+TEST_F(GraphTest, RemoveControlEdge) {
+ FromGraphDef(
+ "node { name: 'A' op: 'OneOutput' }"
+ "node { name: 'B' op: 'OneInputTwoOutputs' input: [ 'A:0' ] }"
+ "node { name: 'C' op: 'NoOp' } ");
+ Node* a = FindNode("A");
+ Node* b = FindNode("B");
+ Node* c = FindNode("C");
+
+ // Add a control edge.
+ const Edge* edge_1 = graph_.AddControlEdge(c, a);
+ const Edge* edge_2 = graph_.AddControlEdge(a, b);
+ ASSERT_TRUE(edge_1 != nullptr);
+ ASSERT_TRUE(edge_2 != nullptr);
+
+ ASSERT_TRUE(ControlEdgeExistsInGraphOrNodeDef(c, a));
+ ASSERT_TRUE(ControlEdgeExistsInGraphOrNodeDef(a, b));
+
+ graph_.RemoveControlEdge(edge_1);
+ ASSERT_TRUE(!ControlEdgeExistsInGraphOrNodeDef(c, a));
+ ASSERT_TRUE(ControlEdgeExistsInGraphOrNodeDef(a, b));
+
+ graph_.RemoveControlEdge(edge_2);
+ ASSERT_TRUE(!ControlEdgeExistsInGraphOrNodeDef(c, a));
+ ASSERT_TRUE(!ControlEdgeExistsInGraphOrNodeDef(a, b));
+
+ // Test removing a duplicate control edge.
+ // Note that unless allow_duplicates is true, the duplicate edge
+ // will not be added. That's why we expect edge_4 to be a null
+ // pointer. We are not testing with allow_duplicates set to true,
+ // as that is a highly unlikely use case that does not make much
+ // sense.
+ const Edge* edge_3 = graph_.AddControlEdge(c, a);
+ const Edge* edge_4 = graph_.AddControlEdge(c, a);
+ ASSERT_TRUE(edge_3 != nullptr);
+ ASSERT_TRUE(edge_4 == nullptr);
+
+ graph_.RemoveControlEdge(edge_3);
+ ASSERT_TRUE(!ControlEdgeExistsInGraphOrNodeDef(c, a));
+}
+
TEST_F(GraphTest, UpdateEdge) {
// Build a little graph
Node* a = FromNodeDef("A", "OneOutput", 0);
diff --git a/tensorflow/core/graph/mkl_graph_util.h b/tensorflow/core/graph/mkl_graph_util.h
index cb32d64334..880e4e712e 100644
--- a/tensorflow/core/graph/mkl_graph_util.h
+++ b/tensorflow/core/graph/mkl_graph_util.h
@@ -21,107 +21,108 @@ limitations under the License.
#include "tensorflow/core/framework/op_kernel.h"
namespace tensorflow {
-// Since our ops are going to produce and also consume N addition tensors
-// (Mkl) for N Tensorflow tensors, we can have following different
-// orderings among these 2N tensors.
-//
-// E.g., for Tensorflow tensors A, B, and C, our ops will produce and
-// consume A_m, B_m, and C_m additionally.
-//
-// INTERLEAVED: in this case 2N tensors are interleaved. So for above
-// example, the ordering looks like: A, A_m, B, B_m, C, C_m.
-//
-// CONTIGUOUS: in thi case N Tensorflow tensors are contiguous followed
-// by N Mkl tensors. So for above example, the ordering looks
-// like: A, B, C, A_m, B_m, C_m
-//
-// Following APIs map index of original Tensorflow tensors to their
-// appropriate position based on selected ordering. For contiguous ordering,
-// we need to know the total number of tensors (parameter total).
-//
-typedef enum { TENSORS_INTERLEAVED, TENSORS_CONTIGUOUS } MklTfTensorOrdering;
-// NOTE: Currently, we use contiguous ordering. If you change this, then you
-// would need to change Mkl op definitions in nn_ops.cc.
-static MklTfTensorOrdering kTensorOrdering = TENSORS_CONTIGUOUS;
+  // Since our ops are going to produce and also consume N additional tensors
+  // (Mkl) for N Tensorflow tensors, we can have the following different
+  // orderings among these 2N tensors.
+ //
+ // E.g., for Tensorflow tensors A, B, and C, our ops will produce and
+ // consume A_m, B_m, and C_m additionally.
+ //
+ // INTERLEAVED: in this case 2N tensors are interleaved. So for above
+ // example, the ordering looks like: A, A_m, B, B_m, C, C_m.
+ //
+  // CONTIGUOUS: in this case N Tensorflow tensors are contiguous, followed
+ // by N Mkl tensors. So for above example, the ordering looks
+ // like: A, B, C, A_m, B_m, C_m
+ //
+ // Following APIs map index of original Tensorflow tensors to their
+ // appropriate position based on selected ordering. For contiguous ordering,
+ // we need to know the total number of tensors (parameter total).
+ //
+ typedef enum { TENSORS_INTERLEAVED, TENSORS_CONTIGUOUS } MklTfTensorOrdering;
+ // NOTE: Currently, we use contiguous ordering. If you change this, then you
+ // would need to change Mkl op definitions in nn_ops.cc.
+ static MklTfTensorOrdering kTensorOrdering = TENSORS_CONTIGUOUS;
-// Get index of MetaData tensor from index 'n' of Data tensor.
-inline int DataIndexToMetaDataIndex(int n, int total_tensors) {
- if (kTensorOrdering == MklTfTensorOrdering::TENSORS_INTERLEAVED) {
- // For interleaved ordering, Mkl tensor follows immediately after
- // Tensorflow tensor.
- return n + 1;
- } else {
- CHECK_EQ(kTensorOrdering, MklTfTensorOrdering::TENSORS_CONTIGUOUS);
- // For contiguous ordering, Mkl tensor is n+total_tensors / 2 away.
- return n + total_tensors / 2;
+ // Get index of MetaData tensor from index 'n' of Data tensor.
+ inline int DataIndexToMetaDataIndex(int n, int total_tensors) {
+ if (kTensorOrdering == MklTfTensorOrdering::TENSORS_INTERLEAVED) {
+ // For interleaved ordering, Mkl tensor follows immediately after
+ // Tensorflow tensor.
+ return n + 1;
+ } else {
+ CHECK_EQ(kTensorOrdering, MklTfTensorOrdering::TENSORS_CONTIGUOUS);
+      // For contiguous ordering, the Mkl tensor is at index n + total_tensors / 2.
+ return n + total_tensors / 2;
+ }
}
-}
-int inline GetTensorDataIndex(int n, int total_tensors) {
- if (kTensorOrdering == MklTfTensorOrdering::TENSORS_INTERLEAVED) {
- return 2 * n; // index corresponding to nth input/output tensor
- } else {
- CHECK_EQ(kTensorOrdering, MklTfTensorOrdering::TENSORS_CONTIGUOUS);
- return n;
- }
-}
+ int inline GetTensorDataIndex(int n, int total_tensors) {
+ if (kTensorOrdering == MklTfTensorOrdering::TENSORS_INTERLEAVED) {
+ return 2 * n; // index corresponding to nth input/output tensor
+ } else {
+ CHECK_EQ(kTensorOrdering, MklTfTensorOrdering::TENSORS_CONTIGUOUS);
+ return n;
+ }
+ }
-int inline GetTensorMetaDataIndex(int n, int total_tensors) {
- // Get index for TensorData first and then use mapping function
- // to get TensorMetaData index from TensorData index.
- int tidx = GetTensorDataIndex(n, total_tensors);
- return DataIndexToMetaDataIndex(tidx, total_tensors);
-}
+ int inline GetTensorMetaDataIndex(int n, int total_tensors) {
+ // Get index for TensorData first and then use mapping function
+ // to get TensorMetaData index from TensorData index.
+ int tidx = GetTensorDataIndex(n, total_tensors);
+ return DataIndexToMetaDataIndex(tidx, total_tensors);
+ }
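A worked example of the two orderings for N = 3 tensors A, B, C (total_tensors = 6):

  // CONTIGUOUS  (A, B, C, A_m, B_m, C_m):
  //   GetTensorDataIndex(1, 6)     == 1               // B
  //   GetTensorMetaDataIndex(1, 6) == 1 + 6/2 == 4    // B_m
  // INTERLEAVED (A, A_m, B, B_m, C, C_m):
  //   GetTensorDataIndex(1, 6)     == 2 * 1  == 2     // B
  //   GetTensorMetaDataIndex(1, 6) == 2 + 1  == 3     // B_m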
namespace mkl_op_registry {
-static const char* kMklOpLabel = "MklOp";
-static const char* kMklOpLabelPattern = "label='MklOp'";
-
-// Get the name of Mkl op from original TensorFlow op
-// We prefix 'Mkl' to the original op to get Mkl op.
-inline string GetMklOpName(const string& name) {
- // Prefix that we add to Tensorflow op name to construct Mkl op name.
- const char* const kMklOpPrefix = "_Mkl";
- return string(kMklOpPrefix) + name;
-}
+ static const char* kMklOpLabel = "MklOp";
+ static const char* kMklOpLabelPattern = "label='MklOp'";
-// Check whether opname with type T is registered as MKL-compliant.
-//
-// @input: name of the op
-// @input: T datatype to be used for checking op
-// @return: true if opname is registered as Mkl op; false otherwise
-static inline bool IsMklOp(const std::string& op_name, DataType T) {
- string kernel = KernelsRegisteredForOp(op_name);
- bool result =
- kernel.find(kMklOpLabelPattern) != string::npos && (T == DT_FLOAT);
- if (result) {
- VLOG(1) << "mkl_op_registry::" << op_name << " is " << kMklOpLabel;
+ // Get the name of Mkl op from original TensorFlow op
+ // We prefix 'Mkl' to the original op to get Mkl op.
+ inline string GetMklOpName(const string& name) {
+ // Prefix that we add to Tensorflow op name to construct Mkl op name.
+ const char* const kMklOpPrefix = "_Mkl";
+ return string(kMklOpPrefix) + name;
}
- return result;
-}
-// Check whether opname with type T is registered as MKL-compliant and
-// is element-wise.
-//
-// @input: name of the op
-// @input: T datatype to be used for checking op
-// @return: true if opname is registered as element-wise Mkl op;
-// false otherwise
-static inline bool IsMklElementWiseOp(const std::string& op_name, DataType T) {
- if (!IsMklOp(op_name, T)) {
- return false;
+ // Check whether opname with type T is registered as MKL-compliant.
+ //
+ // @input: name of the op
+ // @input: T datatype to be used for checking op
+ // @return: true if opname is registered as Mkl op; false otherwise
+ static inline bool IsMklOp(const std::string& op_name, DataType T) {
+ string kernel = KernelsRegisteredForOp(op_name);
+ bool result =
+ kernel.find(kMklOpLabelPattern) != string::npos && (T == DT_FLOAT);
+ if (result) {
+ VLOG(1) << "mkl_op_registry::" << op_name << " is " << kMklOpLabel;
+ }
+ return result;
}
- bool result = (0 == op_name.compare(GetMklOpName("Add")) ||
- 0 == op_name.compare(GetMklOpName("Sub")) ||
- 0 == op_name.compare(GetMklOpName("Mul")) ||
- 0 == op_name.compare(GetMklOpName("Maximum")) ||
- 0 == op_name.compare(GetMklOpName("SquaredDifference")));
+ // Check whether opname with type T is registered as MKL-compliant and
+ // is element-wise.
+ //
+ // @input: name of the op
+ // @input: T datatype to be used for checking op
+ // @return: true if opname is registered as element-wise Mkl op;
+ // false otherwise
+ static inline bool IsMklElementWiseOp(const std::string& op_name,
+ DataType T) {
+ if (!IsMklOp(op_name, T)) {
+ return false;
+ }
- VLOG(1) << "mkl_op_registry::" << op_name
- << " is elementwise MKL op: " << result;
- return result;
-}
+ bool result = (0 == op_name.compare(GetMklOpName("Add")) ||
+ 0 == op_name.compare(GetMklOpName("Sub")) ||
+ 0 == op_name.compare(GetMklOpName("Mul")) ||
+ 0 == op_name.compare(GetMklOpName("Maximum")) ||
+ 0 == op_name.compare(GetMklOpName("SquaredDifference")));
+
+ VLOG(1) << "mkl_op_registry::" << op_name
+ << " is elementwise MKL op: " << result;
+ return result;
+ }
} // namespace mkl_op_registry
} // namespace tensorflow
#endif // INTEL_MKL
diff --git a/tensorflow/core/graph/mkl_layout_pass.cc b/tensorflow/core/graph/mkl_layout_pass.cc
index f4c9073dee..912075aa28 100644
--- a/tensorflow/core/graph/mkl_layout_pass.cc
+++ b/tensorflow/core/graph/mkl_layout_pass.cc
@@ -37,8 +37,8 @@ limitations under the License.
#include "tensorflow/core/platform/logging.h"
#include "tensorflow/core/util/tensor_format.h"
-#include "tensorflow/core/graph/mkl_graph_util.h"
#include "tensorflow/core/graph/mkl_layout_pass.h"
+#include "tensorflow/core/graph/mkl_graph_util.h"
namespace tensorflow {
diff --git a/tensorflow/core/graph/mkl_tfconversion_pass.cc b/tensorflow/core/graph/mkl_tfconversion_pass.cc
index fe4588389e..599bb88f01 100644
--- a/tensorflow/core/graph/mkl_tfconversion_pass.cc
+++ b/tensorflow/core/graph/mkl_tfconversion_pass.cc
@@ -33,8 +33,8 @@ limitations under the License.
#include "tensorflow/core/lib/hash/hash.h"
#include "tensorflow/core/platform/logging.h"
-#include "tensorflow/core/graph/mkl_graph_util.h"
#include "tensorflow/core/graph/mkl_tfconversion_pass.h"
+#include "tensorflow/core/graph/mkl_graph_util.h"
namespace tensorflow {
@@ -68,7 +68,7 @@ namespace tensorflow {
// take place before we hit the op. For this, we add a new op before each
// element-wise MKL op to deal with the inputs, called _MklInputConversion.
// This pass has been enhanced to add this capability.
-//
+//
// The _MklInputConversion op will check the inputs to the elementwise op and
// make sure that either both are in MKL format or both are in TF format,
// depending on their initial state and whether broadcast is needed or not.
diff --git a/tensorflow/core/grappler/costs/graph_properties.cc b/tensorflow/core/grappler/costs/graph_properties.cc
index 35048a4fcf..44322a2d8c 100644
--- a/tensorflow/core/grappler/costs/graph_properties.cc
+++ b/tensorflow/core/grappler/costs/graph_properties.cc
@@ -50,9 +50,13 @@ template <typename Handle>
struct HandleToObject {};
template <>
struct HandleToObject<ShapeHandle> {
- typedef ShapeHandle Object;
+ typedef TensorShapeProto Object;
- static ShapeHandle Unknown() { return ShapeHandle(); }
+ static TensorShapeProto Unknown() {
+ TensorShapeProto result;
+ result.set_unknown_rank(true);
+ return result;
+ }
};
template <>
@@ -63,24 +67,13 @@ struct HandleToObject<DimensionHandle> {
};
template <typename Handle>
-struct Processor {};
-
-template <>
-struct Processor<ShapeHandle> {
+struct Processor {
// Extract the shape or dim denoted by the handle.
- void ExtractValue(ShapeHandle h, ShapeHandle* result) { *result = h; }
+ void ExtractValue(Handle /*t1*/,
+ typename HandleToObject<Handle>::Object* result) {}
// Merge the shapes or dims.
- Status Merge(ShapeHandle h1, ShapeHandle h2, ShapeHandle* result) {
- if (InferenceContext::RankKnown(*result)) {
- // The result was initialized in a previous merge to a shape of known
- // rank, make sure we preserve that information.
- return Status::OK();
- }
- if (InferenceContext::RankKnown(h1)) {
- *result = h1;
- } else {
- *result = h2;
- }
+ Status Merge(Handle /*t1*/, Handle /*t2*/,
+ typename HandleToObject<Handle>::Object* result) {
return Status::OK();
}
};
@@ -108,37 +101,24 @@ struct Processor<DimensionHandle> {
if (dim1 >= 0 && dim2 >= 0) {
CHECK_EQ(dim1, dim2);
- return RefineDim(dim1, result);
+ *result = dim1;
} else if (dim1 >= 0 && dim2 < 0) {
- return RefineDim(dim1, result);
+ *result = dim1;
} else if (dim1 < 0 && dim2 >= 0) {
- return RefineDim(dim2, result);
+ *result = dim2;
} else if (dim1 < -1) {
- return RefineDim(dim1, result);
+ *result = dim1;
} else if (dim2 < -1) {
- return RefineDim(dim2, result);
+ *result = dim2;
} else {
CHECK_EQ(dim1, dim2);
CHECK_EQ(-1, dim1);
- return RefineDim(-1, result);
+ *result = -1;
}
return Status::OK();
}
private:
- Status RefineDim(int64 dim, int64* result) {
- if (*result >= 0) {
- if (!(*result == dim || dim < 0)) {
- return errors::InvalidArgument("Inconsistent dimensions detected");
- }
- } else if (dim >= 0) {
- *result = dim;
- } else if (dim < *result) {
- *result = dim;
- }
- return Status::OK();
- }
-
int64 counter = 2;
};
@@ -374,17 +354,18 @@ class SymbolicShapeManager {
return dims_.Merge(d1, d2);
}
+ int64 Value(DimensionHandle d) { return dims_.GetMergedValue(d); }
+
void AsTensorProperties(const ShapeHandle& shape, const DataType& type,
+ InferenceContext* ctx,
OpInfo::TensorProperties* properties) {
properties->set_dtype(type);
- ShapeHandle actual_shape = shapes_.GetMergedValue(shape);
- if (!InferenceContext::RankKnown(actual_shape)) {
+ if (!ctx->RankKnown(shape)) {
properties->mutable_shape()->set_unknown_rank(true);
} else {
- for (int j = 0; j < InferenceContext::Rank(actual_shape); ++j) {
- shape_inference::DimensionHandle dim =
- InferenceContext::DimKnownRank(actual_shape, j);
- int64 d = dims_.GetMergedValue(dim);
+ for (int j = 0; j < ctx->Rank(shape); ++j) {
+ shape_inference::DimensionHandle dim = ctx->Dim(shape, j);
+ int64 d = Value(dim);
properties->mutable_shape()->add_dim()->set_size(d);
}
}
@@ -466,11 +447,6 @@ Status GraphProperties::InferStatically() {
shape_refiner.set_disable_constant_propagation(true);
shape_refiner.set_function_library_for_shape_inference(&function_library);
ImportGraphDefOptions options;
- // Graph optimization happens at the late stage of graph execution,
- // when colocation constraints are already validated previously and
- // the device placement of nodes has also completed, so there
- // is no need to validate colocation constraints again.
- options.validate_colocation_constraints = false;
Status s = ImportGraphDef(options, item_.graph, &graph, &shape_refiner);
TF_RETURN_IF_ERROR(s);
@@ -496,6 +472,41 @@ Status GraphProperties::InferStatically() {
}
}
}
+
+ // Infer output shapes for Restore ops.
+ if (node->op_def().name() == "Restore" ||
+ node->op_def().name() == "RestoreV2" ||
+ node->op_def().name() == "RestoreSlice") {
+ auto ctx = shape_refiner.GetContext(node);
+ for (const Edge* out_edge : node->out_edges()) {
+ const Node* output = out_edge->dst();
+ int output_idx = out_edge->src_output();
+ if (output_idx < 0) {
+ continue;
+ }
+ if (!ctx->FullyDefined(ctx->output(output_idx)) &&
+ output->op_def().name() == "Assign") {
+ if (!output->attrs().Find("validate_shape") ||
+ !output->attrs().Find("validate_shape")->b()) {
+ continue;
+ }
+ auto output_ctx = shape_refiner.GetContext(output);
+ if (output_ctx->FullyDefined(output_ctx->output(0))) {
+ ctx->set_output(output_idx, output_ctx->output(0));
+ output_ctx->MergeInput(1, output_ctx->output(0));
+ } else {
+ const Node* var;
+ TF_CHECK_OK(node->input_node(0, &var));
+ if (node->IsVariable()) {
+ auto var_ctx = shape_refiner.GetContext(var);
+ CHECK(var_ctx->FullyDefined(var_ctx->output(0)));
+ ctx->set_output(output_idx, var_ctx->output(0));
+ output_ctx->MergeInput(1, var_ctx->output(0));
+ }
+ }
+ }
+ }
+ }
}
// Propagate the initial shapes of Enter nodes manually (the Enter shape
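
The rule this block encodes: a Restore output whose shape is statically unknown may borrow the shape of a downstream Assign that has validate_shape=true, because such an Assign would fail at runtime on any other shape; failing that, the fully defined shape of the variable being assigned is used. A toy distillation of the decision, using hypothetical stand-in types rather than the real InferenceContext:

#include <optional>

struct Shape { bool fully_defined = false; };  // stand-in, illustration only

// Returns the shape to copy onto the Restore output, if any.
std::optional<Shape> InferRestoreOutput(const Shape& assign_out,
                                        const Shape& var_out,
                                        bool validate_shape) {
  if (!validate_shape) return std::nullopt;  // no runtime shape guarantee
  if (assign_out.fully_defined) return assign_out;
  if (var_out.fully_defined) return var_out;
  return std::nullopt;
}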
@@ -628,6 +639,8 @@ Status GraphProperties::InferStatically() {
} while (!done);
}
+ std::unordered_map<const shape_inference::Dimension*, int> dim_ids;
+
// Track shapes globally across the graph.
SymbolicShapeManager shape_manager;
bool found_error = false;
@@ -675,7 +688,7 @@ Status GraphProperties::InferStatically() {
input_properties.resize(ctx->num_inputs());
for (int i = 0; i < ctx->num_inputs(); ++i) {
shape_manager.AsTensorProperties(ctx->input(i), node->input_type(i),
- &input_properties[i]);
+ ctx, &input_properties[i]);
}
for (const auto& edge : node->in_edges()) {
if (!edge->src()->IsConstant()) {
@@ -702,7 +715,7 @@ Status GraphProperties::InferStatically() {
output_properties.resize(ctx->num_outputs());
for (int i = 0; i < ctx->num_outputs(); ++i) {
shape_manager.AsTensorProperties(ctx->output(i), node->output_type(i),
- &output_properties[i]);
+ ctx, &output_properties[i]);
}
}
}
diff --git a/tensorflow/core/grappler/costs/graph_properties.h b/tensorflow/core/grappler/costs/graph_properties.h
index a6aed0bba6..e2fe9f9689 100644
--- a/tensorflow/core/grappler/costs/graph_properties.h
+++ b/tensorflow/core/grappler/costs/graph_properties.h
@@ -55,6 +55,12 @@ class GraphProperties {
const std::vector<OpInfo::TensorProperties>& GetOutputProperties(
const string& node_name) const;
+ static void FillTensorPropertiesFromContext(
+ const shape_inference::ShapeHandle&, const DataType&,
+ shape_inference::InferenceContext*,
+ std::unordered_map<const shape_inference::Dimension*, int>* dim_ids,
+ OpInfo::TensorProperties*);
+
private:
// Inputs
GrapplerItem item_;
diff --git a/tensorflow/core/grappler/costs/graph_properties_test.cc b/tensorflow/core/grappler/costs/graph_properties_test.cc
index f785f627e1..a33cdacc09 100644
--- a/tensorflow/core/grappler/costs/graph_properties_test.cc
+++ b/tensorflow/core/grappler/costs/graph_properties_test.cc
@@ -23,7 +23,6 @@ limitations under the License.
#include "tensorflow/core/grappler/grappler_item.h"
#include "tensorflow/core/grappler/inputs/trivial_test_graph_input_yielder.h"
#include "tensorflow/core/grappler/inputs/utils.h"
-#include "tensorflow/core/lib/core/status_test_util.h"
#include "tensorflow/core/lib/io/path.h"
#include "tensorflow/core/lib/strings/strcat.h"
#include "tensorflow/core/platform/protobuf.h"
@@ -296,9 +295,10 @@ TEST_F(GraphPropertiesTest, Queues) {
ASSERT_EQ(1, props2.size());
EXPECT_EQ("float: [3,7]", PropToString(props2[0]));
+ // The dequeue3 op shape is unknown.
const auto props3 = properties.GetOutputProperties("Dequeue3");
ASSERT_EQ(1, props3.size());
- EXPECT_EQ("float: [3,7]", PropToString(props3[0]));
+ EXPECT_EQ("float: ?", PropToString(props3[0]));
// The dequeue3 op shape is unknown. The square2 op shape is known. Verify
// that we merge the two properly to determine the shape of the data coming out
@@ -677,8 +677,8 @@ TEST_F(GraphPropertiesTest, InferRestoreOpShape) {
TEST_F(GraphPropertiesTest, InferRestoreOpShape_WithTwoNodesShareSameOutput) {
tensorflow::Scope s = tensorflow::Scope::NewRootScope();
- Output var = ops::Variable(s.WithOpName("var"), PartialTensorShape(),
- DataType::DT_FLOAT);
+ Output var =
+ ops::Variable(s.WithOpName("var"), TensorShape(), DataType::DT_FLOAT);
Output var2 = ops::Variable(s.WithOpName("var2"), TensorShape({128, 256}),
DataType::DT_FLOAT);
Output filename =
@@ -784,30 +784,6 @@ TEST_F(GraphPropertiesTest, SymbolicShapes) {
EXPECT_EQ(shape_f.dim(1).size(), shape_a.dim(1).size());
}
-TEST_F(GraphPropertiesTest, DoNotValidateColocationConstraints) {
- tensorflow::Scope s = tensorflow::Scope::NewRootScope();
- Output a = ops::Const(s.WithOpName("a"), 1.0f, {1});
- Output b = ops::Const(s.WithOpName("b"), 2.0f, {1});
- Output c = ops::Const(s.WithOpName("c").ColocateWith(a), 3.0f, {1});
- GrapplerItem item;
- TF_CHECK_OK(s.ToGraphDef(&item.graph));
- // Create a graph with node a removed (say by some graph optimization
- // pass), noting that node c is colocated with a. This is fine as it
- // is in the late stage of graph execution, the colocation constraints have
- // been validated previously and the device placement of nodes has completed.
- GraphDef optimized_graph;
- for (const auto& node : item.graph.node()) {
- if (node.name() != "a") {
- *optimized_graph.add_node() = node;
- }
- }
- item.graph.Swap(&optimized_graph);
- GraphProperties properties(item);
- // This function should return OK, since it doesn't validate the colocation
- // constraints internally.
- TF_EXPECT_OK(properties.InferStatically());
-}
-
} // namespace
} // namespace grappler
} // namespace tensorflow
diff --git a/tensorflow/core/grappler/optimizers/BUILD b/tensorflow/core/grappler/optimizers/BUILD
index 54004a5e07..669d02815c 100644
--- a/tensorflow/core/grappler/optimizers/BUILD
+++ b/tensorflow/core/grappler/optimizers/BUILD
@@ -112,7 +112,6 @@ tf_cc_test(
deps = [
":constant_folding",
"//tensorflow/cc:cc_ops",
- "//tensorflow/cc:cc_ops_internal",
"//tensorflow/core:all_kernels",
"//tensorflow/core:core_cpu",
"//tensorflow/core:core_cpu_internal",
diff --git a/tensorflow/core/grappler/optimizers/arithmetic_optimizer.cc b/tensorflow/core/grappler/optimizers/arithmetic_optimizer.cc
index f2277a9b79..38af7170b5 100644
--- a/tensorflow/core/grappler/optimizers/arithmetic_optimizer.cc
+++ b/tensorflow/core/grappler/optimizers/arithmetic_optimizer.cc
@@ -185,6 +185,10 @@ bool IsInnerMatrixTransposeNode(const NodeDef& transpose_node,
return false;
}
+bool SimplyReordersData(const NodeDef& node) {
+ return node.op() == "Transpose";
+}
+
// Follow a chain (through input(0)) of ops starting at `source->input(0)` as
// long as they
// 1. preserve the values of their first input,
@@ -703,6 +707,7 @@ string ArithmeticOptimizer::TrySimplifyAndReplaceUses(
node_map->AddOutput(new_transpose->name(), new_cast->name());
new_nodes->push_back(new_transpose);
+ new_nodes->push_back(new_cast);
// Add frame dependencies that the original node might have had.
AddFrameControlDeps(node, {new_transpose, new_cast},
new_transpose->input(0), {new_transpose},
@@ -832,7 +837,8 @@ string ArithmeticOptimizer::TrySimplifyAndReplaceUses(
}
}
- if (node->input_size() > 0 && IsAggregate(*node)) {
+ if (node->input_size() > 0 && IsAggregate(*node) &&
+ !node_map->GetOutputs(node->name()).empty()) {
// Discard aggregate nodes with a single input.
if (node->input_size() == 1) {
return node->input(0);
@@ -853,7 +859,7 @@ string ArithmeticOptimizer::TrySimplifyAndReplaceUses(
break;
}
}
- if (all_equal && node_map->GetNode(node->name() + "_const") == nullptr) {
+ if (all_equal) {
// 1. Create constant node with value N.
const int N = node->input_size();
const auto type = GetDataTypeFromAttr(*node, "T");
@@ -879,6 +885,7 @@ string ArithmeticOptimizer::TrySimplifyAndReplaceUses(
new_mul_node->set_device(node->device());
SetDataTypeToAttr(type, "T", new_mul_node);
node_map->AddNode(new_mul_node->name(), new_mul_node);
+ new_nodes->push_back(new_mul_node);
new_mul_node->add_input(new_const_node->name());
node_map->AddOutput(new_const_node->name(), new_mul_node->name());
new_mul_node->add_input(node->input(0));
@@ -895,7 +902,7 @@ string ArithmeticOptimizer::TrySimplifyAndReplaceUses(
// where all the inputs are Mul nodes. This pattern occurs frequently in
// regularization terms for the gradients during training.
if (node->input_size() > 1 && IsAggregate(*node) &&
- node_map->GetNode(node->name() + "_hoist") == nullptr) {
+ !node_map->GetOutputs(node->name()).empty()) {
// Determine the set of common factors if the input nodes are all Mul nodes.
std::set<string> common_factors;
int i = 0;
@@ -943,6 +950,7 @@ string ArithmeticOptimizer::TrySimplifyAndReplaceUses(
new_mul_node->set_name(new_mul_node->name() + "_hoist");
new_mul_node->set_input(0, common_factor);
new_mul_node->set_input(1, new_add_node->name());
+ new_nodes->push_back(new_mul_node);
node_map->AddNode(new_mul_node->name(), new_mul_node);
}
}
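
The hoisting rewrite rests on distributivity: AddN(Mul(c, x1), ..., Mul(c, xN)) equals Mul(c, AddN(x1, ..., xN)), trading N multiplies for one. In miniature (values chosen to be exactly representable, so the float comparison is exact):

#include <cassert>

int main() {
  const float c = 0.5f, x = 3.0f, y = 5.0f;
  assert(c * x + c * y == c * (x + y));  // 4.0f == 4.0f, exactly
  return 0;
}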
@@ -1007,9 +1015,8 @@ string ArithmeticOptimizer::TrySimplifyAndReplaceUses(
}
// Fold Conj into Transpose or ConjugateTranspose.
- if ((node->op() == "Conj" || node->op() == "Transpose" ||
- node->op() == "ConjugateTranspose") &&
- node_map->GetNode(node->name() + "_fused") == nullptr) {
+ if (node->op() == "Conj" || node->op() == "Transpose" ||
+ node->op() == "ConjugateTranspose") {
const NodeDef* input = node_map->GetNode(node->input(0));
const NodeDef* transpose_op = node->op() == "Conj" ? input : node;
const NodeDef* conj_op = node->op() == "Conj" ? node : input;
@@ -1042,14 +1049,10 @@ namespace {
template <class T>
class SetVector {
public:
- // Returns false if value already existed in the set, true otherwise.
- bool PushBack(const T& value) {
- if (!set_.insert(value).second) {
- VLOG(2) << "Value " << value << " is already in the set.";
- return false;
- }
+ void PushBack(const T& value) {
+ CHECK(!Exists(value)) << "Value " << value << " is already in the set.";
+ set_.insert(value);
vector_.push_back(value);
- return true;
}
T PopBack() {
@@ -1090,11 +1093,6 @@ Status ArithmeticOptimizer::SimplifyArithmeticOps(
}
if (NodeName(simplified_tensor) != node->name()) {
- // Always consider simplified_tensor for further optimizations.
- const NodeDef* simplified_node = node_map.GetNode(simplified_tensor);
- if (simplified_node != nullptr) {
- nodes_to_simplify.PushBack(simplified_node);
- }
// When `node` is simplified to another node rather than in-place, the
// consumers of `node` are already redirected to `simplified_tensor`.
// Re-push the consumers into `nodes_to_simplify` for further
diff --git a/tensorflow/core/grappler/optimizers/arithmetic_optimizer_test.cc b/tensorflow/core/grappler/optimizers/arithmetic_optimizer_test.cc
index 60fb47f51a..9f471302c7 100644
--- a/tensorflow/core/grappler/optimizers/arithmetic_optimizer_test.cc
+++ b/tensorflow/core/grappler/optimizers/arithmetic_optimizer_test.cc
@@ -38,8 +38,8 @@ TEST_F(ArithmeticOptimizerTest, NoOp) {
ArithmeticOptimizer optimizer;
GraphDef output;
- Status status = optimizer.Optimize(nullptr, item, &output);
- TF_EXPECT_OK(status);
+ Status s = optimizer.Optimize(nullptr, item, &output);
+ TF_EXPECT_OK(s);
EXPECT_EQ(item.graph.node_size(), output.node_size());
for (int i = 0; i < item.graph.node_size(); ++i) {
@@ -66,10 +66,6 @@ TEST_F(ArithmeticOptimizerTest, OpDedupping) {
GraphDef output;
Status status = optimizer.Optimize(nullptr, item, &output);
TF_EXPECT_OK(status);
- // Run the optimizer twice to make sure the rewrite is idempotent.
- item.graph.Swap(&output);
- status = optimizer.Optimize(nullptr, item, &output);
- TF_EXPECT_OK(status);
EXPECT_EQ(2, output.node_size());
const NodeDef& new_c1 = output.node(0);
@@ -95,10 +91,6 @@ TEST_F(ArithmeticOptimizerTest, OpDedupCommutative) {
GraphDef output;
Status status = optimizer.Optimize(nullptr, item, &output);
TF_EXPECT_OK(status);
- // Run the optimizer twice to make sure the rewrite is idempotent.
- item.graph.Swap(&output);
- status = optimizer.Optimize(nullptr, item, &output);
- TF_EXPECT_OK(status);
EXPECT_EQ(4, output.node_size());
const NodeDef& new_c1 = output.node(0);
@@ -154,17 +146,13 @@ TEST_F(ArithmeticOptimizerTest, SimplifyInvolutionsWithChain) {
GraphDef output;
Status status = optimizer.Optimize(nullptr, item, &output);
TF_EXPECT_OK(status);
- // Run the optimizer twice to make sure the rewrite is idempotent.
- item.graph.Swap(&output);
- status = optimizer.Optimize(nullptr, item, &output);
- TF_EXPECT_OK(status);
EXPECT_EQ(6, output.node_size());
EXPECT_EQ("squeeze", output.node(5).input(0));
EXPECT_EQ("c", output.node(2).input(0));
}
-TEST_F(ArithmeticOptimizerTest, TrivialSumsSimple) {
+TEST_F(ArithmeticOptimizerTest, SimplifyReplaceTrivialSums) {
tensorflow::Scope s = tensorflow::Scope::NewRootScope();
Output x = ops::Const(s.WithOpName("x"), {1.0f, 2.0f}, {1, 2});
Output add = ops::Add(s.WithOpName("add"), x, x);
@@ -177,10 +165,6 @@ TEST_F(ArithmeticOptimizerTest, TrivialSumsSimple) {
GraphDef output;
Status status = optimizer.Optimize(nullptr, item, &output);
TF_EXPECT_OK(status);
- // Run the optimizer twice to make sure the rewrite is idempotent.
- item.graph.Swap(&output);
- status = optimizer.Optimize(nullptr, item, &output);
- TF_EXPECT_OK(status);
EXPECT_EQ(5, output.node_size());
const NodeDef& new_const = output.node(3);
@@ -194,61 +178,7 @@ TEST_F(ArithmeticOptimizerTest, TrivialSumsSimple) {
EXPECT_EQ("add_mul", new_id.input(0));
}
-TEST_F(ArithmeticOptimizerTest, TrivialSumsRepeatedAdd) {
- // Test case from b/69059093.
- tensorflow::Scope s = tensorflow::Scope::NewRootScope();
- Output p = ops::Placeholder(s, DT_FLOAT, ops::Placeholder::Shape({10, 10}));
- Output add = ops::Add(s.WithOpName("Add"), p, p);
- Output add1 = ops::Add(s.WithOpName("Add_1"), p, p);
- Output add4 = ops::Add(s.WithOpName("Add_4"), add, add1);
- Output add5 = ops::Add(s.WithOpName("Add_5"), add, add1);
- Output add6 = ops::Add(s.WithOpName("Add_6"), add4, add5);
- Output id = ops::Identity(s.WithOpName("id"), add6);
-
- GrapplerItem item;
- TF_CHECK_OK(s.ToGraphDef(&item.graph));
- ArithmeticOptimizer optimizer;
- GraphDef output;
- Status status = optimizer.Optimize(nullptr, item, &output);
- TF_EXPECT_OK(status);
- // Run the optimizer twice to make sure the rewrite is idempotent.
- item.graph.Swap(&output);
- status = optimizer.Optimize(nullptr, item, &output);
- TF_EXPECT_OK(status);
-
- EXPECT_EQ(11, output.node_size());
- const NodeDef& new_id = output.node(4);
- EXPECT_EQ("id", new_id.name());
- EXPECT_EQ("Add_6_mul", new_id.input(0));
-
- // Add4 and add5 get deduped, and we rewrite each of the 3 remaining add nodes
- // of the form Add(x,x) into Mul(Const(2), x).
- const NodeDef& new_add_4_const = output.node(5);
- EXPECT_EQ("Add_4_const", new_add_4_const.name());
- EXPECT_EQ("^Add", new_add_4_const.input(0));
- const NodeDef& new_add_4_mul = output.node(6);
- EXPECT_EQ("Add_4_mul", new_add_4_mul.name());
- EXPECT_EQ("Add_4_const", new_add_4_mul.input(0));
- EXPECT_EQ("Add_mul", new_add_4_mul.input(1));
-
- const NodeDef& new_add_6_const = output.node(7);
- EXPECT_EQ("Add_6_const", new_add_6_const.name());
- EXPECT_EQ("^Add_4_mul", new_add_6_const.input(0));
- const NodeDef& new_add_6_mul = output.node(8);
- EXPECT_EQ("Add_6_mul", new_add_6_mul.name());
- EXPECT_EQ("Add_6_const", new_add_6_mul.input(0));
- EXPECT_EQ("Add_4_mul", new_add_6_mul.input(1));
-
- const NodeDef& new_add_const = output.node(9);
- EXPECT_EQ("Add_const", new_add_const.name());
- EXPECT_EQ("^Placeholder", new_add_const.input(0));
- const NodeDef& new_add_mul = output.node(10);
- EXPECT_EQ("Add_mul", new_add_mul.name());
- EXPECT_EQ("Add_const", new_add_mul.input(0));
- EXPECT_EQ("Placeholder", new_add_mul.input(1));
-}
-
-TEST_F(ArithmeticOptimizerTest, HoistFactor) {
+TEST_F(ArithmeticOptimizerTest, SimplifyHoistFactor) {
tensorflow::Scope s = tensorflow::Scope::NewRootScope();
Output x = ops::Const(s.WithOpName("x"), {1.0f, 2.0f}, {1, 2});
Output y1 = ops::Const(s.WithOpName("y1"), {3.0f, 4.0f}, {1, 2});
@@ -265,10 +195,6 @@ TEST_F(ArithmeticOptimizerTest, HoistFactor) {
GraphDef output;
Status status = optimizer.Optimize(nullptr, item, &output);
TF_EXPECT_OK(status);
- // Run the optimizer twice to make sure the rewrite is idempotent.
- item.graph.Swap(&output);
- status = optimizer.Optimize(nullptr, item, &output);
- TF_EXPECT_OK(status);
EXPECT_EQ(9, output.node_size());
const NodeDef& new_add = output.node(8);
@@ -299,10 +225,6 @@ TEST_F(ArithmeticOptimizerTest, FuseConjAndTranspose) {
GraphDef output;
Status status = optimizer.Optimize(nullptr, item, &output);
TF_EXPECT_OK(status);
- // Run the optimizer twice to make sure the rewrite is idempotent.
- item.graph.Swap(&output);
- status = optimizer.Optimize(nullptr, item, &output);
- TF_EXPECT_OK(status);
EXPECT_EQ(7, output.node_size());
EXPECT_EQ("trans_fused", output.node(6).name());
@@ -350,10 +272,6 @@ TEST_F(ArithmeticOptimizerTest, FuseTransposeAndConj) {
GraphDef output;
Status status = optimizer.Optimize(nullptr, item, &output);
TF_EXPECT_OK(status);
- // Run the optimizer twice to make sure the rewrite is idempotent.
- item.graph.Swap(&output);
- status = optimizer.Optimize(nullptr, item, &output);
- TF_EXPECT_OK(status);
EXPECT_EQ(7, output.node_size());
EXPECT_EQ("conj_fused", output.node(6).name());
@@ -386,10 +304,6 @@ TEST_F(ArithmeticOptimizerTest, FoldTransposeIntoMatMul) {
GraphDef output;
Status status = optimizer.Optimize(nullptr, item, &output);
TF_EXPECT_OK(status);
- // Run the optimizer twice to make sure the rewrite is idempotent.
- item.graph.Swap(&output);
- status = optimizer.Optimize(nullptr, item, &output);
- TF_EXPECT_OK(status);
EXPECT_EQ(7, output.node_size());
EXPECT_EQ("matmul_fused", output.node(6).name());
diff --git a/tensorflow/core/grappler/optimizers/constant_folding.cc b/tensorflow/core/grappler/optimizers/constant_folding.cc
index 02a732b092..cb02314183 100644
--- a/tensorflow/core/grappler/optimizers/constant_folding.cc
+++ b/tensorflow/core/grappler/optimizers/constant_folding.cc
@@ -36,7 +36,6 @@ limitations under the License.
#include "tensorflow/core/lib/strings/strcat.h"
#include "tensorflow/core/platform/env.h"
#include "tensorflow/core/public/version.h"
-#include "tensorflow/core/util/bcast.h"
namespace tensorflow {
namespace grappler {
@@ -96,15 +95,11 @@ class DeviceSimple : public DeviceBase {
};
} // namespace
-ConstantFolding::ConstantFolding(RewriterConfig::Toggle opt_level,
- DeviceBase* cpu_device)
- : opt_level_(opt_level), cpu_device_(cpu_device) {
+ConstantFolding::ConstantFolding(DeviceBase* cpu_device)
+ : cpu_device_(cpu_device) {
resource_mgr_.reset(new ResourceMgr());
}
-ConstantFolding::ConstantFolding(DeviceBase* cpu_device)
- : ConstantFolding(RewriterConfig::ON, cpu_device) {}
-
// static
string ConstantFolding::AddControlDependency(const string& input_name,
GraphDef* graph,
@@ -286,149 +281,6 @@ Status ConstantFolding::MaterializeShapes(const GrapplerItem& item,
return Status::OK();
}
-bool ShapesEqual(const TensorShapeProto& shape1,
- const TensorShapeProto& shape2) {
- if (shape1.unknown_rank() || shape2.unknown_rank()) {
- return false;
- }
- if (shape1.dim_size() != shape2.dim_size()) {
- return false;
- }
- for (int i = 0; i < shape1.dim_size(); ++i) {
- if (shape1.dim(i).size() != shape2.dim(i).size()) {
- return false;
- }
- }
- return true;
-}
-
-namespace {
-bool ExtractShape(const NodeDef& shape_node, const GraphProperties& properties,
- BCast::Vec* shape, int64* min_id) {
- if (shape_node.op() == "Shape") {
- const std::vector<OpInfo::TensorProperties>& prop1 =
- properties.GetInputProperties(shape_node.name());
- if (prop1.size() != 1) {
- return false;
- }
- const TensorShapeProto& shp = prop1[0].shape();
- if (shp.unknown_rank()) {
- return false;
- }
- for (const auto& dim : shp.dim()) {
- shape->push_back(dim.size());
- *min_id = std::min<int64>(*min_id, dim.size());
- }
- } else {
- const TensorProto& raw_val = shape_node.attr().at("value").tensor();
- if (raw_val.dtype() != DT_INT64 && raw_val.dtype() != DT_INT32) {
- return false;
- }
- Tensor value(raw_val.dtype(), raw_val.tensor_shape());
- if (!value.FromProto(raw_val)) {
- return false;
- }
- for (int j = 0; j < value.NumElements(); ++j) {
- if (raw_val.dtype() == DT_INT64) {
- shape->push_back(value.vec<int64>()(j));
- } else {
- shape->push_back(value.vec<int>()(j));
- }
- }
- }
- return true;
-}
-} // namespace
-
-Status ConstantFolding::MaterializeConstants(
- const GrapplerItem& item, const GraphProperties& properties) {
- const int node_count = graph_.node_size();
- for (int i = 0; i < node_count; ++i) {
- NodeDef& node = *graph_.mutable_node(i);
- const string& op = node.op();
- if (op != "BroadcastGradientArgs") {
- continue;
- }
- const NodeDef* shape_node1 = node_map_->GetNode(node.input(0));
- const NodeDef* shape_node2 = node_map_->GetNode(node.input(1));
- if (shape_node1 == nullptr ||
- (shape_node1->op() != "Shape" && shape_node1->op() != "Const") ||
- shape_node2 == nullptr ||
- (shape_node2->op() != "Shape" && shape_node2->op() != "Const")) {
- continue;
- }
- int64 min_id = 0;
- BCast::Vec shape1;
- if (!ExtractShape(*shape_node1, properties, &shape1, &min_id)) {
- continue;
- }
- BCast::Vec shape2;
- if (!ExtractShape(*shape_node2, properties, &shape2, &min_id)) {
- continue;
- }
- // A value of -1 means we don't know anything about the dimension. Replace
- // the -1 values with unique dimension ids since we don't want two '-1'
- // dimensions to be considered equal.
- for (auto& id : shape1) {
- if (id == -1) {
- id = --min_id;
- }
- }
- for (auto& id : shape2) {
- if (id == -1) {
- id = --min_id;
- }
- }
- BCast bcast(shape1, shape2);
- if (!bcast.IsValid()) {
- continue;
- }
- BCast::Vec reduce_dims[2];
- reduce_dims[0] = bcast.grad_x_reduce_idx();
- reduce_dims[1] = bcast.grad_y_reduce_idx();
-
- const DataType type = node.attr().at("T").type();
- NodeDef* out[2];
- for (int j = 0; j < 2; ++j) {
- if (!reduce_dims[j].empty()) {
- // This is the case when a tensor dimension 1 is matched against an
- // unknown dimension. The unknown dimension could also be equal to 1, in
- // which case there would be no reduction.
- out[j] = nullptr;
- } else {
- Tensor value(type, TensorShape({0}));
- string const_name = AddPrefixToNodeName(
- strings::StrCat(node.name(), "-", j), kConstantFoldingConst);
- out[j] = node_map_->GetNode(const_name);
- if (!out[j]) {
- out[j] = graph_.add_node();
- *out[j] = CreateNodeDef(const_name, TensorValue(&value));
- out[j]->set_device(node.device());
- node_map_->AddNode(const_name, out[j]);
- string ctrl_dep =
- AddControlDependency(node.name(), &graph_, node_map_.get());
- *out[j]->add_input() = ctrl_dep;
- node_map_->AddOutput(NodeName(ctrl_dep), const_name);
- }
- }
- }
-
- auto outputs = node_map_->GetOutputs(node.name());
- for (const auto& output : outputs) {
- for (int k = 0; k < output->input_size(); ++k) {
- int port;
- string node_name = ParseNodeName(output->input(k), &port);
- if (node_name == node.name() && port >= 0 && port < 2 && out[port]) {
- *output->mutable_input(k) = out[port]->name();
- node_map_->UpdateInput(output->name(), node_name, out[port]->name());
- }
- }
- }
- }
-
- return Status::OK();
-}
-
bool ConstantFolding::IsFoldable(const NodeDef& node) const {
// Folding not applicable to ops with no inputs.
if (node.input().empty()) {
@@ -1069,23 +921,23 @@ Status ConstantFolding::RunOptimizationPass(Cluster* cluster,
}
GraphProperties properties(item);
- Status s = properties.InferStatically();
bool has_feed = !item.feed.empty();
-
- if (!has_feed && s.ok()) {
+ if (!has_feed) {
// Only use static shape information when there is no feed in the
// graph. That's because it's possible to feed a placeholder with a tensor
// of any shape, which could make the static information inconsistent with
// the shapes actually fed.
- TF_RETURN_IF_ERROR(MaterializeShapes(item, properties));
- }
- if (opt_level_ == RewriterConfig::AGGRESSIVE && s.ok()) {
- TF_RETURN_IF_ERROR(MaterializeConstants(item, properties));
+ Status s = properties.InferStatically();
+ if (!s.ok()) {
+ VLOG(1) << "Failed to infer graph shapes: " << s;
+ } else {
+ TF_RETURN_IF_ERROR(MaterializeShapes(item, properties));
+ }
}
TF_RETURN_IF_ERROR(FoldGraph(output));
- if (!has_feed && s.ok()) {
+ if (!has_feed) {
TF_RETURN_IF_ERROR(SimplifyGraph(output, properties));
}
return Status::OK();
@@ -1104,14 +956,12 @@ Status ConstantFolding::Optimize(Cluster* cluster, const GrapplerItem& item,
GrapplerItem item_to_optimize = item;
*output = item.graph;
- int64 node_count;
do {
graph_.Swap(output);
item_to_optimize.graph = graph_;
*output = GraphDef();
- node_count = graph_.node_size();
TF_RETURN_IF_ERROR(RunOptimizationPass(cluster, item_to_optimize, output));
- } while (output->node_size() != node_count);
+ } while (output->node_size() < graph_.node_size());
*output->mutable_library() = item.graph.library();
*output->mutable_versions() = item.graph.versions();
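
The reverted loop condition repeats the pass only while it strictly shrinks the graph, which guarantees termination: the node count is a non-negative integer and cannot decrease forever. The generic shape of such a loop, as a sketch:

#include <cstddef>

// Re-run a shrinking pass to a fixed point; the strictly decreasing,
// non-negative node count bounds the number of iterations.
template <typename Graph, typename Pass>
void RunToFixedPoint(Graph* graph, Pass pass) {
  std::size_t before;
  do {
    before = graph->node_size();
    pass(graph);
  } while (graph->node_size() < before);
}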
diff --git a/tensorflow/core/grappler/optimizers/constant_folding.h b/tensorflow/core/grappler/optimizers/constant_folding.h
index dd988f336c..30d778789a 100644
--- a/tensorflow/core/grappler/optimizers/constant_folding.h
+++ b/tensorflow/core/grappler/optimizers/constant_folding.h
@@ -22,7 +22,6 @@ limitations under the License.
#include "tensorflow/core/grappler/costs/graph_properties.h"
#include "tensorflow/core/grappler/optimizers/graph_optimizer.h"
#include "tensorflow/core/grappler/utils.h"
-#include "tensorflow/core/protobuf/rewriter_config.pb.h"
namespace tensorflow {
namespace grappler {
@@ -38,7 +37,6 @@ class ConstantFolding : public GraphOptimizer {
NodeMap* node_map);
ConstantFolding(DeviceBase* cpu_device);
- ConstantFolding(RewriterConfig::Toggle opt_level, DeviceBase* cpu_device);
~ConstantFolding() override {}
@@ -53,8 +51,7 @@ class ConstantFolding : public GraphOptimizer {
private:
Status MaterializeShapes(const GrapplerItem& item,
const GraphProperties& properties);
- Status MaterializeConstants(const GrapplerItem& item,
- const GraphProperties& properties);
+
bool IsFoldable(const NodeDef& node) const;
Status EvaluateNode(const NodeDef& node,
@@ -77,7 +74,6 @@ class ConstantFolding : public GraphOptimizer {
GraphDef* output);
// Points to an externally provided device or to owned_device_;
- RewriterConfig::Toggle opt_level_;
DeviceBase* cpu_device_;
std::unique_ptr<DeviceBase> owned_device_;
diff --git a/tensorflow/core/grappler/optimizers/constant_folding_test.cc b/tensorflow/core/grappler/optimizers/constant_folding_test.cc
index 43f84b1ddf..a1dee6d2fb 100644
--- a/tensorflow/core/grappler/optimizers/constant_folding_test.cc
+++ b/tensorflow/core/grappler/optimizers/constant_folding_test.cc
@@ -14,7 +14,6 @@ limitations under the License.
==============================================================================*/
#include "tensorflow/core/grappler/optimizers/constant_folding.h"
-#include "tensorflow/cc/ops/array_ops_internal.h"
#include "tensorflow/cc/ops/standard_ops.h"
#include "tensorflow/core/framework/node_def.pb.h"
#include "tensorflow/core/framework/tensor_testutil.h"
@@ -839,85 +838,6 @@ TEST_F(ConstantFoldingTest, Packing) {
// size needed to naively encode 1000 floats folded twice).
EXPECT_GT(8000, output.ByteSizeLong());
}
-
-TEST_F(ConstantFoldingTest, ConstantMaterialization) {
- tensorflow::Scope s = tensorflow::Scope::NewRootScope();
- Output a =
- ops::Placeholder(s.WithOpName("a"), DT_FLOAT,
- ops::Placeholder::Shape(PartialTensorShape({-1, -1})));
- Output b = ops::Square(s.WithOpName("b"), a);
- Output c = ops::Mul(s.WithOpName("c"), a, b);
- Output d = ops::Shape(s.WithOpName("d"), a);
- Output e = ops::Shape(s.WithOpName("e"), b);
-
- auto f = ops::internal::BroadcastGradientArgs(s.WithOpName("f"), d, e);
- Output o1 = ops::Identity(s.WithOpName("o1"), f.r0);
- Output o2 = ops::Identity(s.WithOpName("o2"), f.r1);
-
- Output g = ops::Placeholder(s.WithOpName("g"), DT_FLOAT,
- ops::Placeholder::Shape(PartialTensorShape({1})));
- Output h = ops::Shape(s.WithOpName("h"), g);
- auto i = ops::internal::BroadcastGradientArgs(s.WithOpName("i"), d, h);
- Output p1 = ops::Identity(s.WithOpName("p1"), i.r0);
- Output p2 = ops::Identity(s.WithOpName("p2"), i.r1);
-
- GrapplerItem item;
- TF_CHECK_OK(s.ToGraphDef(&item.graph));
-
- ConstantFolding fold(RewriterConfig::AGGRESSIVE, nullptr /* cpu_device */);
- GraphDef output;
- Status status = fold.Optimize(nullptr, item, &output);
- TF_EXPECT_OK(status);
-
- // Run a second time to make sure the optimization is idempotent.
- item.graph.Swap(&output);
- status = fold.Optimize(nullptr, item, &output);
- TF_EXPECT_OK(status);
-
- int found = 0;
- for (const auto& node : output.node()) {
- if (node.name() == "o1") {
- ++found;
- EXPECT_EQ(1, node.input_size());
- EXPECT_EQ("ConstantFolding/f-0", node.input(0));
- } else if (node.name() == "o2") {
- ++found;
- EXPECT_EQ(1, node.input_size());
- EXPECT_EQ("ConstantFolding/f-1", node.input(0));
- } else if (node.name() == "ConstantFolding/f-0") {
- ++found;
- EXPECT_EQ("Const", node.op());
- EXPECT_EQ(1, node.input_size());
- EXPECT_EQ("^f", node.input(0));
- EXPECT_EQ(0, TensorShape(node.attr().at("value").tensor().tensor_shape())
- .num_elements());
- } else if (node.name() == "ConstantFolding/f-1") {
- ++found;
- EXPECT_EQ("Const", node.op());
- EXPECT_EQ(1, node.input_size());
- EXPECT_EQ("^f", node.input(0));
- EXPECT_EQ(0, TensorShape(node.attr().at("value").tensor().tensor_shape())
- .num_elements());
- } else if (node.name() == "p1") {
- ++found;
- EXPECT_EQ(1, node.input_size());
- EXPECT_EQ("ConstantFolding/i-0", node.input(0));
- } else if (node.name() == "p2") {
- ++found;
- EXPECT_EQ(1, node.input_size());
- EXPECT_EQ("i:1", node.input(0));
- } else if (node.name() == "ConstantFolding/i-0") {
- ++found;
- EXPECT_EQ("Const", node.op());
- EXPECT_EQ(1, node.input_size());
- EXPECT_EQ("^i", node.input(0));
- EXPECT_EQ(0, TensorShape(node.attr().at("value").tensor().tensor_shape())
- .num_elements());
- }
- }
- EXPECT_EQ(7, found);
-}
-
} // namespace
} // namespace grappler
} // namespace tensorflow
diff --git a/tensorflow/core/grappler/optimizers/meta_optimizer.cc b/tensorflow/core/grappler/optimizers/meta_optimizer.cc
index 6204a81f80..a9875c06d8 100644
--- a/tensorflow/core/grappler/optimizers/meta_optimizer.cc
+++ b/tensorflow/core/grappler/optimizers/meta_optimizer.cc
@@ -64,8 +64,8 @@ Status MetaOptimizer::Optimize(Cluster* cluster, const GrapplerItem& item,
optimizers.push_back(std::unique_ptr<GraphOptimizer>(new ModelPruner()));
}
if (cfg_.constant_folding() != RewriterConfig::OFF) {
- optimizers.push_back(std::unique_ptr<GraphOptimizer>(
- new ConstantFolding(cfg_.constant_folding(), cpu_device_)));
+ optimizers.push_back(
+ std::unique_ptr<GraphOptimizer>(new ConstantFolding(cpu_device_)));
}
if (cfg_.arithmetic_optimization() != RewriterConfig::OFF) {
optimizers.push_back(std::unique_ptr<GraphOptimizer>(
diff --git a/tensorflow/core/grappler/utils.cc b/tensorflow/core/grappler/utils.cc
index 11bd8fa5cb..54be02b5f8 100644
--- a/tensorflow/core/grappler/utils.cc
+++ b/tensorflow/core/grappler/utils.cc
@@ -45,6 +45,7 @@ NodeDef* NodeMap::GetNode(const string& name) const {
string node_name = NodeName(name);
auto it = nodes_.find(node_name);
if (it == nodes_.end()) {
+ LOG(WARNING) << "Node " << node_name << " is not in the graph.";
return nullptr;
}
return it->second;
@@ -61,7 +62,7 @@ const std::set<NodeDef*>& NodeMap::GetOutputs(const string& node_name) const {
void NodeMap::AddNode(const string& name, NodeDef* node) {
auto ret = nodes_.insert(std::make_pair(name, node));
CHECK(ret.second) << "Pair (" << name << "," << node
- << ") is not inserted because a same key already exists.";
+ << ") is not inserted because the same key already exists.";
}
void NodeMap::AddOutput(const string& node_name, const string& output_name) {
diff --git a/tensorflow/core/kernels/BUILD b/tensorflow/core/kernels/BUILD
index 4169e842da..a5c62fef17 100644
--- a/tensorflow/core/kernels/BUILD
+++ b/tensorflow/core/kernels/BUILD
@@ -917,6 +917,25 @@ tf_cc_test(
)
tf_cuda_cc_test(
+ name = "bincount_op_test",
+ size = "small",
+ srcs = ["bincount_op_test.cc"],
+ deps = [
+ ":bincount_op",
+ ":ops_testutil",
+ ":ops_util",
+ "//tensorflow/core:core_cpu",
+ "//tensorflow/core:framework",
+ "//tensorflow/core:lib",
+ "//tensorflow/core:math_ops_op_lib",
+ "//tensorflow/core:protos_all_cc",
+ "//tensorflow/core:test",
+ "//tensorflow/core:test_main",
+ "//tensorflow/core:testlib",
+ ],
+)
+
+tf_cuda_cc_test(
name = "constant_op_test",
size = "small",
srcs = ["constant_op_test.cc"],
@@ -1601,7 +1620,10 @@ DYNAMIC_DEPS = [
tf_kernel_library(
name = "dynamic_partition_op",
prefix = "dynamic_partition_op",
- deps = DYNAMIC_DEPS,
+ deps = DYNAMIC_DEPS + [
+ ":fill_functor",
+ ":gather_functor",
+ ] + if_cuda(["@cub_archive//:cub"]),
)
tf_kernel_library(
@@ -1671,7 +1693,7 @@ tf_kernel_library(
],
)
-tf_cc_tests(
+tf_cuda_cc_tests(
name = "dynamic_op_test",
size = "small",
srcs = [
@@ -2554,8 +2576,9 @@ tf_kernel_library(
tf_kernel_library(
name = "bucketize_op",
+ gpu_srcs = ["cuda_device_array.h"],
prefix = "bucketize_op",
- deps = MATH_DEPS,
+ deps = ARRAY_DEPS,
)
tf_kernel_library(
@@ -3156,7 +3179,7 @@ tf_kernel_library(
"//tensorflow/core:lib",
"//tensorflow/core:lib_internal",
"//third_party/eigen3",
- ],
+ ] + if_cuda(["@cub_archive//:cub"]),
)
tf_kernel_library(
@@ -4420,15 +4443,6 @@ filegroup(
"fill_functor.h",
"function_ops.cc",
"gather_functor.h",
- "gather_nd_op.cc",
- "gather_nd_op.h",
- "gather_nd_op_cpu_impl.h",
- "gather_nd_op_cpu_impl_0.cc",
- "gather_nd_op_cpu_impl_1.cc",
- "gather_nd_op_cpu_impl_2.cc",
- "gather_nd_op_cpu_impl_3.cc",
- "gather_nd_op_cpu_impl_4.cc",
- "gather_nd_op_cpu_impl_5.cc",
"gather_op.cc",
"identity_n_op.cc",
"identity_n_op.h",
@@ -4522,10 +4536,6 @@ filegroup(
"fused_batch_norm_op.h",
"gemm_functors.h",
"image_resizer_state.h",
- "initializable_lookup_table.h",
- "lookup_table_init_op.h",
- "lookup_table_op.h",
- "lookup_util.h",
"maxpooling_op.h",
"mfcc.h",
"mfcc_dct.h",
@@ -4542,7 +4552,6 @@ filegroup(
"resize_nearest_neighbor_op.h",
"reverse_op.h",
"save_restore_tensor.h",
- "segment_reduction_ops.h",
"softplus_op.h",
"softsign_op.h",
"spacetobatch_functor.h",
@@ -4592,8 +4601,6 @@ filegroup(
"cwise_op_div.cc",
"cwise_op_equal_to_1.cc",
"cwise_op_equal_to_2.cc",
- "cwise_op_not_equal_to_1.cc",
- "cwise_op_not_equal_to_2.cc",
"cwise_op_exp.cc",
"cwise_op_floor.cc",
"cwise_op_floor_div.cc",
@@ -4635,7 +4642,6 @@ filegroup(
"encode_wav_op.cc",
"fake_quant_ops.cc",
"fifo_queue.cc",
- "fifo_queue_op.cc",
"fused_batch_norm_op.cc",
"population_count_op.cc",
"population_count_op.h",
@@ -4659,11 +4665,7 @@ filegroup(
"depthtospace_op.cc",
"dynamic_stitch_op.cc",
"in_topk_op.cc",
- "initializable_lookup_table.cc",
"logging_ops.cc",
- "lookup_table_init_op.cc",
- "lookup_table_op.cc",
- "lookup_util.cc",
"lrn_op.cc",
"maxpooling_op.cc",
"mfcc.cc",
@@ -4698,15 +4700,12 @@ filegroup(
"save_op.cc",
"save_restore_tensor.cc",
"save_restore_v2_ops.cc",
- "segment_reduction_ops.cc",
"session_ops.cc",
"softplus_op.cc",
"softsign_op.cc",
"spacetobatch_functor.cc",
"spacetobatch_op.cc",
"spacetodepth_op.cc",
- "sparse_fill_empty_rows_op.cc",
- "sparse_reshape_op.cc",
"sparse_to_dense_op.cc",
"spectrogram.cc",
"spectrogram_op.cc",
@@ -4729,7 +4728,6 @@ filegroup(
"training_ops.cc",
"transpose_functor_cpu.cc",
"transpose_op.cc",
- "unique_op.cc",
"warn_about_ints.cc",
"where_op.cc",
"xent_op.cc",
@@ -6243,11 +6241,8 @@ tf_kernel_library(
srcs = ["summary_kernels.cc"],
deps = [
":summary_interface",
- "//tensorflow/contrib/tensorboard/db:summary_db_writer",
"//tensorflow/core:framework",
- "//tensorflow/core:lib",
"//tensorflow/core:summary_ops_op_lib",
- "//tensorflow/core/lib/db:sqlite",
],
)
diff --git a/tensorflow/core/kernels/avgpooling_op.cc b/tensorflow/core/kernels/avgpooling_op.cc
index af629d0de8..f918023693 100644
--- a/tensorflow/core/kernels/avgpooling_op.cc
+++ b/tensorflow/core/kernels/avgpooling_op.cc
@@ -153,7 +153,8 @@ class AvgPoolingOp<GPUDevice, T> : public UnaryOp<T> {
if (data_format_ == FORMAT_NCHW) {
DnnPoolingOp<T>::Compute(
context, perftools::gputools::dnn::PoolingMode::kAverage, ksize_,
- stride_, padding_, data_format_, tensor_in, output_shape);
+ stride_, padding_, data_format_, tensor_in, output_shape,
+ /*propagate_nans=*/false);
} else {
Tensor* output = nullptr;
OP_REQUIRES_OK(context,
@@ -408,7 +409,7 @@ class AvgPoolingGradOp<GPUDevice, T> : public OpKernel {
DnnPoolingGradOp<T>::Compute(
context, perftools::gputools::dnn::PoolingMode::kAverage, ksize_,
stride_, padding_, data_format_, nullptr, nullptr, out_backprop,
- output_shape);
+ output_shape, /*propagate_nans=*/false);
}
private:
@@ -532,7 +533,7 @@ class AvgPoolingGradOpCustomGPUKernel : public OpKernel {
DnnPoolingGradOp<T>::Compute(
context, perftools::gputools::dnn::PoolingMode::kAverage, ksize_,
stride_, padding_, data_format_, nullptr, nullptr, out_backprop,
- output_shape);
+ output_shape, /*propagate_nans=*/false);
}
}
diff --git a/tensorflow/core/kernels/batch_dataset_op.cc b/tensorflow/core/kernels/batch_dataset_op.cc
index 6a5fd17a9e..2e52ad39f8 100644
--- a/tensorflow/core/kernels/batch_dataset_op.cc
+++ b/tensorflow/core/kernels/batch_dataset_op.cc
@@ -143,13 +143,9 @@ class BatchDatasetOp : public UnaryDatasetOpKernel {
// Each row of `batch_elements` is a tuple of tensors from the
// input iterator.
std::vector<std::vector<Tensor>> batch_elements;
+ batch_elements.reserve(dataset()->batch_size_);
{
mutex_lock l(mu_);
- if (!input_impl_) {
- *end_of_sequence = true;
- return Status::OK();
- }
- batch_elements.reserve(dataset()->batch_size_);
*end_of_sequence = false;
for (int i = 0; i < dataset()->batch_size_ && !*end_of_sequence;
++i) {
@@ -158,8 +154,6 @@ class BatchDatasetOp : public UnaryDatasetOpKernel {
end_of_sequence));
if (!*end_of_sequence) {
batch_elements.emplace_back(std::move(batch_element_tuple));
- } else {
- input_impl_.reset();
}
}
}
@@ -200,23 +194,14 @@ class BatchDatasetOp : public UnaryDatasetOpKernel {
protected:
Status SaveInternal(IteratorStateWriter* writer) override {
mutex_lock l(mu_);
- if (!input_impl_) {
- TF_RETURN_IF_ERROR(
- writer->WriteScalar(full_name("input_impl_empty"), ""));
- } else {
- TF_RETURN_IF_ERROR(SaveParent(writer, input_impl_));
- }
+ TF_RETURN_IF_ERROR(SaveParent(writer, input_impl_));
return Status::OK();
}
Status RestoreInternal(OpKernelContext* ctx,
IteratorStateReader* reader) override {
mutex_lock l(mu_);
- if (!reader->Contains(full_name("input_impl_empty"))) {
- TF_RETURN_IF_ERROR(RestoreParent(ctx, reader, input_impl_));
- } else {
- input_impl_.reset();
- }
+ TF_RETURN_IF_ERROR(RestoreParent(ctx, reader, input_impl_));
return Status::OK();
}
diff --git a/tensorflow/core/kernels/bincount_op.cc b/tensorflow/core/kernels/bincount_op.cc
index 1cd5943ef3..766d63e3be 100644
--- a/tensorflow/core/kernels/bincount_op.cc
+++ b/tensorflow/core/kernels/bincount_op.cc
@@ -17,6 +17,7 @@ limitations under the License.
#define EIGEN_USE_THREADS
+#include "tensorflow/core/kernels/bincount_op.h"
#include "tensorflow/core/framework/op_kernel.h"
#include "tensorflow/core/framework/register_types.h"
#include "tensorflow/core/framework/types.h"
@@ -27,46 +28,37 @@ namespace tensorflow {
using thread::ThreadPool;
-template <typename T>
-class BincountOp : public OpKernel {
- public:
- explicit BincountOp(OpKernelConstruction* ctx) : OpKernel(ctx) {}
+typedef Eigen::ThreadPoolDevice CPUDevice;
+typedef Eigen::GpuDevice GPUDevice;
- void Compute(OpKernelContext* ctx) override {
- const Tensor& arr_t = ctx->input(0);
- const Tensor& size_tensor = ctx->input(1);
- const Tensor& weights_t = ctx->input(2);
- int32 size = size_tensor.scalar<int32>()();
- OP_REQUIRES(
- ctx, size >= 0,
- errors::InvalidArgument("size (", size, ") must be non-negative"));
- const bool has_weights = weights_t.NumElements() > 0;
- OP_REQUIRES(ctx, !(has_weights && arr_t.shape() != weights_t.shape()),
- errors::InvalidArgument(
- "If weights are passed, they must have the same shape (" +
- weights_t.shape().DebugString() + ") as arr (" +
- arr_t.shape().DebugString() + ")"));
- const auto arr = arr_t.flat<int32>();
- const auto weights = weights_t.flat<T>();
+namespace functor {
+
+template <typename T>
+struct BincountFunctor<CPUDevice, T> {
+ static Status Compute(OpKernelContext* context,
+ const typename TTypes<int32, 1>::ConstTensor& arr,
+ const typename TTypes<T, 1>::ConstTensor& weights,
+ typename TTypes<T, 1>::Tensor& output) {
+ int size = output.size();
Tensor all_nonneg_t;
- OP_REQUIRES_OK(ctx,
- ctx->allocate_temp(DT_BOOL, TensorShape({}), &all_nonneg_t,
- AllocatorAttributes()));
- all_nonneg_t.scalar<bool>().device(ctx->eigen_cpu_device()) =
+ TF_RETURN_IF_ERROR(context->allocate_temp(
+ DT_BOOL, TensorShape({}), &all_nonneg_t, AllocatorAttributes()));
+ all_nonneg_t.scalar<bool>().device(context->eigen_cpu_device()) =
(arr >= 0).all();
- OP_REQUIRES(ctx, all_nonneg_t.scalar<bool>()(),
- errors::InvalidArgument("Input arr must be non-negative!"));
+ if (!all_nonneg_t.scalar<bool>()()) {
+ return errors::InvalidArgument("Input arr must be non-negative!");
+ }
// Allocate partial output bin sums for each worker thread. Worker ids in
// ParallelForWithWorkerId range from 0 to NumThreads() inclusive.
ThreadPool* thread_pool =
- ctx->device()->tensorflow_cpu_worker_threads()->workers;
+ context->device()->tensorflow_cpu_worker_threads()->workers;
const int64 num_threads = thread_pool->NumThreads() + 1;
Tensor partial_bins_t;
- OP_REQUIRES_OK(ctx, ctx->allocate_temp(weights_t.dtype(),
- TensorShape({num_threads, size}),
- &partial_bins_t));
+ TF_RETURN_IF_ERROR(context->allocate_temp(DataTypeToEnum<T>::value,
+ TensorShape({num_threads, size}),
+ &partial_bins_t));
auto partial_bins = partial_bins_t.matrix<T>();
partial_bins.setZero();
thread_pool->ParallelForWithWorkerId(
@@ -75,7 +67,7 @@ class BincountOp : public OpKernel {
for (int64 i = start_ind; i < limit_ind; i++) {
int32 value = arr(i);
if (value < size) {
- if (has_weights) {
+ if (weights.size()) {
partial_bins(worker_id, value) += weights(i);
} else {
// Complex numbers don't support "++".
@@ -84,25 +76,62 @@ class BincountOp : public OpKernel {
}
}
});
- TensorShape output_shape({size});
- Tensor* output_t;
- OP_REQUIRES_OK(ctx, ctx->allocate_output(0, output_shape, &output_t));
+
// Sum the partial bins along the 0th axis.
Eigen::array<int, 1> reduce_dims({0});
- output_t->flat<T>().device(ctx->eigen_cpu_device()) =
- partial_bins.sum(reduce_dims);
+ output.device(context->eigen_cpu_device()) = partial_bins.sum(reduce_dims);
+ return Status::OK();
+ }
+};
+
+} // namespace functor
+
+template <typename Device, typename T>
+class BincountOp : public OpKernel {
+ public:
+ explicit BincountOp(OpKernelConstruction* ctx) : OpKernel(ctx) {}
+
+ void Compute(OpKernelContext* ctx) override {
+ const Tensor& arr_t = ctx->input(0);
+ const Tensor& size_tensor = ctx->input(1);
+ const Tensor& weights_t = ctx->input(2);
+
+ int32 size = size_tensor.scalar<int32>()();
+ OP_REQUIRES(ctx, size >= 0, errors::InvalidArgument(
+ "size (", size, ") must be non-negative"));
+
+ const auto arr = arr_t.flat<int32>();
+ const auto weights = weights_t.flat<T>();
+ Tensor* output_t;
+ OP_REQUIRES_OK(ctx,
+ ctx->allocate_output(0, TensorShape({size}), &output_t));
+ auto output = output_t->flat<T>();
+ OP_REQUIRES_OK(ctx, functor::BincountFunctor<Device, T>::Compute(
+ ctx, arr, weights, output));
}
};
-#define REGISTER(TYPE) \
+#define REGISTER_KERNELS(type) \
REGISTER_KERNEL_BUILDER( \
- Name("Bincount").Device(DEVICE_CPU).TypeConstraint<TYPE>("T"), \
- BincountOp<TYPE>)
+ Name("Bincount").Device(DEVICE_CPU).TypeConstraint<type>("T"), \
+ BincountOp<CPUDevice, type>)
+
+TF_CALL_NUMBER_TYPES(REGISTER_KERNELS);
+#undef REGISTER_KERNELS
+
+#if GOOGLE_CUDA
+
+#define REGISTER_KERNELS(type) \
+ REGISTER_KERNEL_BUILDER(Name("Bincount") \
+ .Device(DEVICE_GPU) \
+ .HostMemory("size") \
+ .TypeConstraint<type>("T"), \
+ BincountOp<GPUDevice, type>)
-TF_CALL_NUMBER_TYPES(REGISTER);
+TF_CALL_int32(REGISTER_KERNELS);
+TF_CALL_float(REGISTER_KERNELS);
+#undef REGISTER_KERNELS
-// TODO(ringwalt): Add a GPU implementation. We probably want to take a
-// different approach, e.g. threads in a warp each taking a pass over the same
-// data, and each thread summing a single bin.
+#endif // GOOGLE_CUDA
} // end namespace tensorflow
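
The CPU functor above avoids synchronizing on a shared histogram: each worker fills its own row of partial bins, and the rows are then summed column-wise. The same strategy stripped to standalone C++, with the thread pool replaced by a modulo sharding loop for brevity:

#include <cstddef>
#include <cstdint>
#include <vector>

std::vector<int64_t> Bincount(const std::vector<int32_t>& arr, int size,
                              int num_workers) {
  // One private row of bins per worker; no locking needed while counting.
  std::vector<std::vector<int64_t>> partial(
      num_workers, std::vector<int64_t>(size, 0));
  for (std::size_t i = 0; i < arr.size(); ++i) {
    int worker = static_cast<int>(i % num_workers);  // stand-in for sharding
    if (arr[i] >= 0 && arr[i] < size) ++partial[worker][arr[i]];
  }
  // Reduce the partial rows along axis 0, as the Eigen sum above does.
  std::vector<int64_t> bins(size, 0);
  for (const auto& row : partial)
    for (int b = 0; b < size; ++b) bins[b] += row[b];
  return bins;
}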
diff --git a/tensorflow/core/kernels/bincount_op.h b/tensorflow/core/kernels/bincount_op.h
new file mode 100644
index 0000000000..0f8dd2b82a
--- /dev/null
+++ b/tensorflow/core/kernels/bincount_op.h
@@ -0,0 +1,41 @@
+/* Copyright 2015 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifndef TENSORFLOW_BINCOUNT_OP_H_
+#define TENSORFLOW_BINCOUNT_OP_H_
+
+#include "tensorflow/core/framework/op_kernel.h"
+#include "tensorflow/core/framework/tensor_types.h"
+#include "tensorflow/core/framework/types.h"
+#include "tensorflow/core/lib/core/errors.h"
+#include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
+
+namespace tensorflow {
+
+namespace functor {
+
+template <typename Device, typename T>
+struct BincountFunctor {
+ static Status Compute(OpKernelContext* context,
+ const typename TTypes<int32, 1>::ConstTensor& arr,
+ const typename TTypes<T, 1>::ConstTensor& weights,
+ typename TTypes<T, 1>::Tensor& output);
+};
+
+} // end namespace functor
+
+} // end namespace tensorflow
+
+#endif // TENSORFLOW_BINCOUNT_OP_H_
diff --git a/tensorflow/core/kernels/bincount_op_gpu.cu.cc b/tensorflow/core/kernels/bincount_op_gpu.cu.cc
new file mode 100644
index 0000000000..ae9e26ffdf
--- /dev/null
+++ b/tensorflow/core/kernels/bincount_op_gpu.cu.cc
@@ -0,0 +1,114 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#if GOOGLE_CUDA
+
+#define EIGEN_USE_GPU
+
+#include "tensorflow/core/kernels/bincount_op.h"
+#include "external/cub_archive/cub/device/device_histogram.cuh"
+#include "tensorflow/core/framework/op_kernel.h"
+#include "tensorflow/core/framework/register_types.h"
+#include "tensorflow/core/framework/tensor.h"
+#include "tensorflow/core/framework/tensor_shape.h"
+#include "tensorflow/core/platform/logging.h"
+#include "tensorflow/core/platform/types.h"
+#include "tensorflow/core/util/cuda_kernel_helper.h"
+
+namespace tensorflow {
+
+typedef Eigen::GpuDevice GPUDevice;
+
+namespace functor {
+
+template <typename T>
+struct BincountFunctor<GPUDevice, T> {
+ static Status Compute(OpKernelContext* context,
+ const typename TTypes<int32, 1>::ConstTensor& arr,
+ const typename TTypes<T, 1>::ConstTensor& weights,
+ typename TTypes<T, 1>::Tensor& output) {
+ if (weights.size() != 0) {
+ return errors::InvalidArgument(
+ "Weights should not be passed as it should be "
+ "handled by unsorted_segment_sum");
+ }
+ if (output.size() == 0) {
+ return Status::OK();
+ }
+ // weights.size() == 0 at this point, so use CUB's histogram directly.
+ size_t temp_storage_bytes = 0;
+ const int32* d_samples = arr.data();
+ T* d_histogram = output.data();
+ int num_levels = output.size() + 1;
+ int32 lower_level = 0;
+ int32 upper_level = output.size();
+ int num_samples = arr.size();
+ const cudaStream_t& stream = GetCudaStream(context);
+
+ // The first HistogramEven call, made with d_temp_storage = NULL, only
+ // computes the temp storage size required.
+ auto err = cub::DeviceHistogram::HistogramEven(
+ /* d_temp_storage */ NULL,
+ /* temp_storage_bytes */ temp_storage_bytes,
+ /* d_samples */ d_samples,
+ /* d_histogram */ d_histogram,
+ /* num_levels */ num_levels,
+ /* lower_level */ lower_level,
+ /* upper_level */ upper_level,
+ /* num_samples */ num_samples,
+ /* stream */ stream);
+ if (err != cudaSuccess) {
+ return errors::Internal(
+ "Could not launch HistogramEven to get temp storage: ",
+ cudaGetErrorString(err), ".");
+ }
+ Tensor temp_storage;
+ TF_RETURN_IF_ERROR(context->allocate_temp(
+ DataTypeToEnum<int8>::value,
+ TensorShape({static_cast<int64>(temp_storage_bytes)}), &temp_storage));
+
+ void* d_temp_storage = temp_storage.flat<int8>().data();
+ // The second HistogramEven call does the actual work, using the
+ // d_temp_storage buffer of temp_storage_bytes allocated above.
+ err = cub::DeviceHistogram::HistogramEven(
+ /* d_temp_storage */ d_temp_storage,
+ /* temp_storage_bytes */ temp_storage_bytes,
+ /* d_samples */ d_samples,
+ /* d_histogram */ d_histogram,
+ /* num_levels */ num_levels,
+ /* lower_level */ lower_level,
+ /* upper_level */ upper_level,
+ /* num_samples */ num_samples,
+ /* stream */ stream);
+ if (err != cudaSuccess) {
+ return errors::Internal("Could not launch HistogramEven: ",
+ cudaGetErrorString(err), ".");
+ }
+ return Status::OK();
+ }
+};
+
+} // end namespace functor
+
+#define REGISTER_GPU_SPEC(type) \
+ template struct functor::BincountFunctor<GPUDevice, type>;
+
+TF_CALL_int32(REGISTER_GPU_SPEC);
+TF_CALL_float(REGISTER_GPU_SPEC);
+#undef REGISTER_GPU_SPEC
+
+} // namespace tensorflow
+
+#endif // GOOGLE_CUDA
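
The doubled HistogramEven call is CUB's standard two-phase idiom: invoked with a null d_temp_storage it only writes the required byte count, and the identical call with a real buffer does the work. A self-contained demo of the idiom (plain cudaMalloc here; the kernel above gets its scratch buffer from allocate_temp instead):

#include <cstdio>
#include <cub/device/device_histogram.cuh>
#include <cuda_runtime.h>

int main() {
  const int n = 8, nbins = 4;
  int h_in[n] = {0, 1, 1, 2, 3, 3, 3, 0};
  int *d_in, *d_hist;
  cudaMalloc(&d_in, n * sizeof(int));
  cudaMalloc(&d_hist, nbins * sizeof(int));
  cudaMemcpy(d_in, h_in, n * sizeof(int), cudaMemcpyHostToDevice);

  size_t temp_bytes = 0;  // Phase 1: null storage only sizes the scratch.
  cub::DeviceHistogram::HistogramEven(nullptr, temp_bytes, d_in, d_hist,
                                      nbins + 1, 0, nbins, n);
  void* d_temp = nullptr;
  cudaMalloc(&d_temp, temp_bytes);
  // Phase 2: the identical call with real storage computes the histogram.
  cub::DeviceHistogram::HistogramEven(d_temp, temp_bytes, d_in, d_hist,
                                      nbins + 1, 0, nbins, n);

  int h_hist[nbins];
  cudaMemcpy(h_hist, d_hist, nbins * sizeof(int), cudaMemcpyDeviceToHost);
  for (int b = 0; b < nbins; ++b) printf("bin %d: %d\n", b, h_hist[b]);
  cudaFree(d_temp);
  cudaFree(d_hist);
  cudaFree(d_in);
  return 0;
}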
diff --git a/tensorflow/core/kernels/bincount_op_test.cc b/tensorflow/core/kernels/bincount_op_test.cc
new file mode 100644
index 0000000000..14becc87a7
--- /dev/null
+++ b/tensorflow/core/kernels/bincount_op_test.cc
@@ -0,0 +1,75 @@
+/* Copyright 2015 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/core/common_runtime/kernel_benchmark_testlib.h"
+#include "tensorflow/core/framework/fake_input.h"
+#include "tensorflow/core/framework/node_def_builder.h"
+#include "tensorflow/core/framework/tensor.h"
+#include "tensorflow/core/graph/node_builder.h"
+#include "tensorflow/core/kernels/ops_testutil.h"
+#include "tensorflow/core/platform/test.h"
+#include "tensorflow/core/platform/test_benchmark.h"
+
+namespace tensorflow {
+
+static Graph* Bincount(int arr_size, int nbins) {
+ Graph* g = new Graph(OpRegistry::Global());
+
+ Tensor arr(DT_INT32, TensorShape({arr_size}));
+ arr.flat<int32>() = arr.flat<int32>().setRandom().abs();
+
+ Tensor size(DT_INT32, TensorShape({(int32)1}));
+ size.flat<int32>()(0) = (int32)nbins;
+
+ Tensor weights(DT_INT32, TensorShape({0}));
+
+ Node* node;
+ TF_CHECK_OK(NodeBuilder(g->NewName("n"), "Bincount")
+ .Input(test::graph::Constant(g, arr))
+ .Input(test::graph::Constant(g, size))
+ .Input(test::graph::Constant(g, weights))
+ .Attr("T", DT_INT32)
+ .Finalize(g, &node));
+ return g;
+}
+
+#define BM_BincountDev(K, NBINS, type) \
+ static void BM_Bincount##_##type##_##K##_##NBINS(int iters) { \
+ testing::ItemsProcessed(static_cast<int64>(iters) * K * 1024); \
+ test::Benchmark(#type, Bincount(K * 1024, NBINS)).Run(iters); \
+ } \
+ BENCHMARK(BM_Bincount##_##type##_##K##_##NBINS);
+
+BM_BincountDev(32, 1000, cpu);
+BM_BincountDev(32, 2000, cpu);
+BM_BincountDev(32, 5000, cpu);
+BM_BincountDev(64, 1000, cpu);
+BM_BincountDev(64, 2000, cpu);
+BM_BincountDev(64, 5000, cpu);
+BM_BincountDev(128, 1000, cpu);
+BM_BincountDev(128, 2000, cpu);
+BM_BincountDev(128, 5000, cpu);
+
+BM_BincountDev(32, 1000, gpu);
+BM_BincountDev(32, 2000, gpu);
+BM_BincountDev(32, 5000, gpu);
+BM_BincountDev(64, 1000, gpu);
+BM_BincountDev(64, 2000, gpu);
+BM_BincountDev(64, 5000, gpu);
+BM_BincountDev(128, 1000, gpu);
+BM_BincountDev(128, 2000, gpu);
+BM_BincountDev(128, 5000, gpu);
+
+} // end namespace tensorflow
diff --git a/tensorflow/core/kernels/bucketize_op.cc b/tensorflow/core/kernels/bucketize_op.cc
index 93c2d01221..c1693de538 100644
--- a/tensorflow/core/kernels/bucketize_op.cc
+++ b/tensorflow/core/kernels/bucketize_op.cc
@@ -15,15 +15,43 @@ limitations under the License.
// See docs in ../ops/math_ops.cc.
-#include <algorithm>
-#include <vector>
-
+#include "tensorflow/core/kernels/bucketize_op.h"
#include "tensorflow/core/framework/op_kernel.h"
-#include "tensorflow/core/lib/core/errors.h"
+#include "tensorflow/core/framework/register_types.h"
+#include "tensorflow/core/framework/tensor.h"
+#include "tensorflow/core/framework/tensor_shape.h"
+#include "tensorflow/core/platform/logging.h"
+#include "tensorflow/core/platform/types.h"
namespace tensorflow {
+using thread::ThreadPool;
+
+typedef Eigen::ThreadPoolDevice CPUDevice;
+typedef Eigen::GpuDevice GPUDevice;
+
+namespace functor {
+
template <typename T>
+struct BucketizeFunctor<CPUDevice, T> {
+ // PRECONDITION: boundaries_vector must be sorted.
+ static Status Compute(OpKernelContext* context,
+ const typename TTypes<T, 1>::ConstTensor& input,
+ const std::vector<float>& boundaries_vector,
+ typename TTypes<int32, 1>::Tensor& output) {
+ const int N = input.size();
+ for (int i = 0; i < N; i++) {
+ auto first_bigger_it = std::upper_bound(
+ boundaries_vector.begin(), boundaries_vector.end(), input(i));
+ output(i) = first_bigger_it - boundaries_vector.begin();
+ }
+
+ return Status::OK();
+ }
+};
+} // namespace functor
+
+template <typename Device, typename T>
class BucketizeOp : public OpKernel {
public:
explicit BucketizeOp(OpKernelConstruction* context) : OpKernel(context) {
@@ -34,36 +62,42 @@ class BucketizeOp : public OpKernel {
void Compute(OpKernelContext* context) override {
const Tensor& input_tensor = context->input(0);
- auto input = input_tensor.flat<T>();
+ const auto input = input_tensor.flat<T>();
+
Tensor* output_tensor = nullptr;
OP_REQUIRES_OK(context, context->allocate_output(0, input_tensor.shape(),
&output_tensor));
auto output = output_tensor->template flat<int32>();
-
- const int N = input.size();
- for (int i = 0; i < N; i++) {
- output(i) = CalculateBucketIndex(input(i));
- }
+ OP_REQUIRES_OK(context, functor::BucketizeFunctor<Device, T>::Compute(
+ context, input, boundaries_, output));
}
private:
- int32 CalculateBucketIndex(const T value) {
- auto first_bigger_it =
- std::upper_bound(boundaries_.begin(), boundaries_.end(), value);
- return first_bigger_it - boundaries_.begin();
- }
std::vector<float> boundaries_;
};
#define REGISTER_KERNEL(T) \
REGISTER_KERNEL_BUILDER( \
Name("Bucketize").Device(DEVICE_CPU).TypeConstraint<T>("T"), \
- BucketizeOp<T>);
+ BucketizeOp<CPUDevice, T>);
+
+REGISTER_KERNEL(int32);
+REGISTER_KERNEL(int64);
+REGISTER_KERNEL(float);
+REGISTER_KERNEL(double);
+#undef REGISTER_KERNEL
+
+#if GOOGLE_CUDA
+#define REGISTER_KERNEL(T) \
+ REGISTER_KERNEL_BUILDER( \
+ Name("Bucketize").Device(DEVICE_GPU).TypeConstraint<T>("T"), \
+ BucketizeOp<GPUDevice, T>);
REGISTER_KERNEL(int32);
REGISTER_KERNEL(int64);
REGISTER_KERNEL(float);
REGISTER_KERNEL(double);
#undef REGISTER_KERNEL
+#endif // GOOGLE_CUDA
} // namespace tensorflow
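
The CPU path delegates the lookup to std::upper_bound, so each value lands in the index of the first boundary strictly greater than it. A standalone check of that convention:

#include <algorithm>
#include <cassert>
#include <vector>

int main() {
  const std::vector<float> boundaries = {0.0f, 10.0f, 100.0f};  // sorted
  auto bucket = [&](float v) {
    return std::upper_bound(boundaries.begin(), boundaries.end(), v) -
           boundaries.begin();
  };
  assert(bucket(-5.0f) == 0);   // below every boundary
  assert(bucket(11.0f) == 2);   // between 10 and 100
  assert(bucket(100.0f) == 3);  // upper_bound is strict: 100 goes right
  return 0;
}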
diff --git a/tensorflow/core/kernels/bucketize_op.h b/tensorflow/core/kernels/bucketize_op.h
new file mode 100644
index 0000000000..c8e461beb9
--- /dev/null
+++ b/tensorflow/core/kernels/bucketize_op.h
@@ -0,0 +1,41 @@
+/* Copyright 2015 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifndef TENSORFLOW_BUCKETIZE_OP_H_
+#define TENSORFLOW_BUCKETIZE_OP_H_
+
+#include <vector>
+#include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
+
+#include "tensorflow/core/framework/op_kernel.h"
+#include "tensorflow/core/framework/tensor_types.h"
+#include "tensorflow/core/framework/types.h"
+#include "tensorflow/core/lib/core/errors.h"
+
+namespace tensorflow {
+namespace functor {
+
+template <typename Device, typename T>
+struct BucketizeFunctor {
+ static Status Compute(OpKernelContext* context,
+ const typename TTypes<T, 1>::ConstTensor& input,
+ const std::vector<float>& boundaries_vector,
+ typename TTypes<int32, 1>::Tensor& output);
+};
+
+} // namespace functor
+} // namespace tensorflow
+
+#endif // TENSORFLOW_BUCKETIZE_OP_H_
diff --git a/tensorflow/core/kernels/bucketize_op_gpu.cu.cc b/tensorflow/core/kernels/bucketize_op_gpu.cu.cc
new file mode 100644
index 0000000000..aafbbe41b4
--- /dev/null
+++ b/tensorflow/core/kernels/bucketize_op_gpu.cu.cc
@@ -0,0 +1,101 @@
+/* Copyright 2016 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#if GOOGLE_CUDA
+
+#define EIGEN_USE_GPU
+
+#include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
+
+#include "tensorflow/core/framework/op_kernel.h"
+#include "tensorflow/core/framework/register_types.h"
+#include "tensorflow/core/framework/tensor.h"
+#include "tensorflow/core/framework/tensor_shape.h"
+#include "tensorflow/core/kernels/bucketize_op.h"
+#include "tensorflow/core/kernels/cuda_device_array.h"
+#include "tensorflow/core/platform/logging.h"
+#include "tensorflow/core/platform/types.h"
+#include "tensorflow/core/util/cuda_kernel_helper.h"
+
+namespace tensorflow {
+
+typedef Eigen::GpuDevice GPUDevice;
+
+template <typename T>
+__global__ void BucketizeCustomKernel(
+ const int32 size_in, const T* in, const int32 size_boundaries,
+ CudaDeviceArrayStruct<float> boundaries_array, int32* out) {
+ const float* boundaries = GetCudaDeviceArrayOnDevice(&boundaries_array);
+ CUDA_1D_KERNEL_LOOP(i, size_in) {
+ T value = in[i];
+ int32 bucket = 0;
+ int32 count = size_boundaries;
+ while (count > 0) {
+ int32 l = bucket;
+ int32 step = count / 2;
+ l += step;
+ if (!(value < static_cast<T>(boundaries[l]))) {
+ bucket = ++l;
+ count -= step + 1;
+ } else {
+ count = step;
+ }
+ }
+ out[i] = bucket;
+ }
+}
+
+namespace functor {
+
+template <typename T>
+struct BucketizeFunctor<GPUDevice, T> {
+ // PRECONDITION: boundaries_vector must be sorted.
+ static Status Compute(OpKernelContext* context,
+ const typename TTypes<T, 1>::ConstTensor& input,
+ const std::vector<float>& boundaries_vector,
+ typename TTypes<int32, 1>::Tensor& output) {
+ const GPUDevice& d = context->eigen_device<GPUDevice>();
+
+ CudaDeviceArrayOnHost<float> boundaries_array(context,
+ boundaries_vector.size());
+ TF_RETURN_IF_ERROR(boundaries_array.Init());
+ for (int i = 0; i < boundaries_vector.size(); ++i) {
+ boundaries_array.Set(i, boundaries_vector[i]);
+ }
+ TF_RETURN_IF_ERROR(boundaries_array.Finalize());
+
+ CudaLaunchConfig config = GetCudaLaunchConfig(input.size(), d);
+ BucketizeCustomKernel<
+ T><<<config.block_count, config.thread_per_block, 0, d.stream()>>>(
+ input.size(), input.data(), boundaries_vector.size(),
+ boundaries_array.data(), output.data());
+
+ return Status::OK();
+ }
+};
+} // namespace functor
+
+#define REGISTER_GPU_SPEC(type) \
+ template struct functor::BucketizeFunctor<GPUDevice, type>;
+
+REGISTER_GPU_SPEC(int32);
+REGISTER_GPU_SPEC(int64);
+REGISTER_GPU_SPEC(float);
+REGISTER_GPU_SPEC(double);
+#undef REGISTER_GPU_SPEC
+
+} // namespace tensorflow
+
+#endif // GOOGLE_CUDA
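
The device kernel above hand-rolls std::upper_bound, since the STL is not available in device code. A host-side sketch of the same loop, checked against std::upper_bound (illustrative only; assumes nothing beyond the standard library):

#include <algorithm>
#include <cassert>
#include <vector>

// Same search as BucketizeCustomKernel: returns the index of the first
// boundary strictly greater than value (std::upper_bound semantics).
int UpperBoundIndex(float value, const std::vector<float>& boundaries) {
  int bucket = 0;
  int count = static_cast<int>(boundaries.size());
  while (count > 0) {
    int step = count / 2;
    int l = bucket + step;
    if (!(value < boundaries[l])) {  // value >= boundaries[l]
      bucket = l + 1;
      count -= step + 1;
    } else {
      count = step;
    }
  }
  return bucket;
}

int main() {
  const std::vector<float> b = {0.0f, 10.0f, 100.0f};
  for (float v : {-5.0f, 0.0f, 10.0f, 99.0f, 100.0f, 1e6f}) {
    // Agrees with std::upper_bound for every probe.
    assert(UpperBoundIndex(v, b) ==
           static_cast<int>(std::upper_bound(b.begin(), b.end(), v) -
                            b.begin()));
  }
  return 0;
}
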
diff --git a/tensorflow/core/kernels/concat_lib_cpu.cc b/tensorflow/core/kernels/concat_lib_cpu.cc
index b0bec0c5dc..258ce15456 100644
--- a/tensorflow/core/kernels/concat_lib_cpu.cc
+++ b/tensorflow/core/kernels/concat_lib_cpu.cc
@@ -74,14 +74,11 @@ REGISTER(qint16)
REGISTER(qint32)
REGISTER(bfloat16)
-#if defined(IS_MOBILE_PLATFORM) && !defined(SUPPORT_SELECTIVE_REGISTRATION) && \
- !defined(__ANDROID_TYPES_FULL__)
-// Primarily used for SavedModel support on mobile. Registering it here only if
-// __ANDROID_TYPES_FULL__ is not defined, as that already register strings
+#if defined(IS_MOBILE_PLATFORM) && !defined(SUPPORT_SELECTIVE_REGISTRATION)
+// Primarily used for SavedModel support on mobile.
REGISTER(string);
#endif // defined(IS_MOBILE_PLATFORM) &&
- // !defined(SUPPORT_SELECTIVE_REGISTRATION) &&
- // !defined(__ANDROID_TYPES_FULL__)
+ // !defined(SUPPORT_SELECTIVE_REGISTRATION)
#ifdef TENSORFLOW_USE_SYCL
template <typename T>
diff --git a/tensorflow/core/kernels/concatenate_dataset_op.cc b/tensorflow/core/kernels/concatenate_dataset_op.cc
index c3bd89c479..711c234129 100644
--- a/tensorflow/core/kernels/concatenate_dataset_op.cc
+++ b/tensorflow/core/kernels/concatenate_dataset_op.cc
@@ -104,10 +104,6 @@ class ConcatenateDatasetOp : public BinaryDatasetOpKernel {
std::vector<Tensor>* out_tensors,
bool* end_of_sequence) override {
mutex_lock l(mu_);
- if (!input_impl_) {
- *end_of_sequence = true;
- return Status::OK();
- }
while (i_ < 2) {
TF_RETURN_IF_ERROR(
input_impl_->GetNext(ctx, out_tensors, end_of_sequence));
@@ -144,9 +140,7 @@ class ConcatenateDatasetOp : public BinaryDatasetOpKernel {
} else if (i_ == 2) {
input_impl_.reset();
}
- if (input_impl_) {
- TF_RETURN_IF_ERROR(RestoreParent(ctx, reader, input_impl_));
- }
+ TF_RETURN_IF_ERROR(RestoreParent(ctx, reader, input_impl_));
return Status::OK();
}
diff --git a/tensorflow/core/kernels/conv_grad_ops_3d.cc b/tensorflow/core/kernels/conv_grad_ops_3d.cc
index 21f5cb1716..f819fccbfb 100644
--- a/tensorflow/core/kernels/conv_grad_ops_3d.cc
+++ b/tensorflow/core/kernels/conv_grad_ops_3d.cc
@@ -236,6 +236,7 @@ class Conv3DBackpropInputOp : public OpKernel {
REGISTER_KERNEL_BUILDER( \
Name("Conv3DBackpropInputV2").Device(DEVICE_CPU).TypeConstraint<T>("T"), \
Conv3DBackpropInputOp<CPUDevice, T>);
+TF_CALL_half(REGISTER_CPU_KERNEL);
TF_CALL_float(REGISTER_CPU_KERNEL);
TF_CALL_double(REGISTER_CPU_KERNEL);
#undef REGISTER_CPU_KERNEL
@@ -383,6 +384,7 @@ class Conv3DBackpropFilterOp : public OpKernel {
.Device(DEVICE_CPU) \
.TypeConstraint<T>("T"), \
Conv3DBackpropFilterOp<CPUDevice, T>);
+TF_CALL_half(REGISTER_CPU_KERNEL);
TF_CALL_float(REGISTER_CPU_KERNEL);
TF_CALL_double(REGISTER_CPU_KERNEL);
#undef REGISTER_CPU_KERNEL
@@ -409,6 +411,7 @@ namespace functor {
const std::array<int, 3>& padding_right, \
typename TTypes<T, 5, int>::Tensor out, TensorFormat format);
+DECLARE_GPU_SPEC(Eigen::half);
DECLARE_GPU_SPEC(float);
#undef DECLARE_GPU_SPEC
} // namespace functor
@@ -1098,22 +1101,29 @@ class Conv3DBackpropFilterOp<GPUDevice, T> : public OpKernel {
bool cudnn_use_autotune_;
};
-REGISTER_KERNEL_BUILDER(
- Name("Conv3DBackpropInput").Device(DEVICE_GPU).TypeConstraint<float>("T"),
- Conv3DBackpropInputOp<GPUDevice, float>);
-REGISTER_KERNEL_BUILDER(Name("Conv3DBackpropInputV2")
- .Device(DEVICE_GPU)
- .TypeConstraint<float>("T")
- .HostMemory("input_sizes"),
- Conv3DBackpropInputOp<GPUDevice, float>);
-REGISTER_KERNEL_BUILDER(
- Name("Conv3DBackpropFilter").Device(DEVICE_GPU).TypeConstraint<float>("T"),
- Conv3DBackpropFilterOp<GPUDevice, float>);
-REGISTER_KERNEL_BUILDER(Name("Conv3DBackpropFilterV2")
- .Device(DEVICE_GPU)
- .TypeConstraint<float>("T")
- .HostMemory("filter_sizes"),
- Conv3DBackpropFilterOp<GPUDevice, float>);
+
+
+#define REGISTER_GPU_KERNEL(T) \
+ REGISTER_KERNEL_BUILDER( \
+ Name("Conv3DBackpropInput").Device(DEVICE_GPU).TypeConstraint<T>("T"), \
+ Conv3DBackpropInputOp<GPUDevice, T>); \
+ REGISTER_KERNEL_BUILDER(Name("Conv3DBackpropInputV2") \
+ .Device(DEVICE_GPU) \
+ .TypeConstraint<T>("T") \
+ .HostMemory("input_sizes"), \
+ Conv3DBackpropInputOp<GPUDevice, T>); \
+ REGISTER_KERNEL_BUILDER( \
+ Name("Conv3DBackpropFilter").Device(DEVICE_GPU).TypeConstraint<T>("T"), \
+ Conv3DBackpropFilterOp<GPUDevice, T>); \
+ REGISTER_KERNEL_BUILDER(Name("Conv3DBackpropFilterV2") \
+ .Device(DEVICE_GPU) \
+ .TypeConstraint<T>("T") \
+ .HostMemory("filter_sizes"), \
+ Conv3DBackpropFilterOp<GPUDevice, T>);
+TF_CALL_half(REGISTER_GPU_KERNEL);
+TF_CALL_float(REGISTER_GPU_KERNEL);
+#undef REGISTER_GPU_KERNEL
+
#endif // GOOGLE_CUDA
} // namespace tensorflow
diff --git a/tensorflow/core/kernels/conv_ops_3d.cc b/tensorflow/core/kernels/conv_ops_3d.cc
index 8a89d564de..37cb67bc51 100644
--- a/tensorflow/core/kernels/conv_ops_3d.cc
+++ b/tensorflow/core/kernels/conv_ops_3d.cc
@@ -145,6 +145,7 @@ class Conv3DOp : public BinaryOp<T> {
REGISTER_KERNEL_BUILDER( \
Name("Conv3D").Device(DEVICE_CPU).TypeConstraint<T>("T"), \
Conv3DOp<CPUDevice, T>);
+TF_CALL_half(REGISTER_CPU_KERNEL);
TF_CALL_float(REGISTER_CPU_KERNEL);
TF_CALL_double(REGISTER_CPU_KERNEL);
#undef REGISTER_CPU_KERNEL
@@ -482,6 +483,7 @@ namespace functor {
const std::array<int, 3>& padding_right, \
typename TTypes<T, 5, int>::Tensor out, TensorFormat format);
+DECLARE_GPU_SPEC(Eigen::half);
DECLARE_GPU_SPEC(float);
#undef DECLARE_GPU_SPEC
@@ -489,6 +491,9 @@ DECLARE_GPU_SPEC(float);
// Registration of the GPU implementations.
REGISTER_KERNEL_BUILDER(
+ Name("Conv3D").Device(DEVICE_GPU).TypeConstraint<Eigen::half>("T"),
+ Conv3DOp<GPUDevice, Eigen::half>);
+REGISTER_KERNEL_BUILDER(
Name("Conv3D").Device(DEVICE_GPU).TypeConstraint<float>("T"),
Conv3DOp<GPUDevice, float>);
#endif // GOOGLE_CUDA
diff --git a/tensorflow/core/kernels/cwise_op_acosh.cc b/tensorflow/core/kernels/cwise_op_acosh.cc
index 7bdd8d22a3..39c8814073 100644
--- a/tensorflow/core/kernels/cwise_op_acosh.cc
+++ b/tensorflow/core/kernels/cwise_op_acosh.cc
@@ -20,16 +20,8 @@ namespace tensorflow {
REGISTER4(UnaryOp, CPU, "Acosh", functor::acosh, float, double,
complex64, complex128);
-#if TENSORFLOW_USE_SYCL
-#define REGISTER_SYCL_KERNEL(TYPE) \
- REGISTER_KERNEL_BUILDER( \
- Name("Acosh") \
- .Device(DEVICE_SYCL) \
- .TypeConstraint<TYPE>("T"), \
- UnaryOp<SYCLDevice, functor::acosh<TYPE>>);
-REGISTER_SYCL_KERNEL(float);
-REGISTER_SYCL_KERNEL(double);
-#undef REGISTER_SYCL_KERNEL
+#ifdef TENSORFLOW_USE_SYCL
+REGISTER2(UnaryOp, SYCL, "Acosh", functor::acosh, float, double);
#endif // TENSORFLOW_USE_SYCL
#if GOOGLE_CUDA
diff --git a/tensorflow/core/kernels/cwise_op_asinh.cc b/tensorflow/core/kernels/cwise_op_asinh.cc
index e0644323c0..8d44208aa7 100644
--- a/tensorflow/core/kernels/cwise_op_asinh.cc
+++ b/tensorflow/core/kernels/cwise_op_asinh.cc
@@ -20,17 +20,9 @@ namespace tensorflow {
REGISTER4(UnaryOp, CPU, "Asinh", functor::asinh, float, double,
complex64, complex128);
-#if TENSORFLOW_USE_SYCL
-#define REGISTER_SYCL_KERNEL(TYPE) \
- REGISTER_KERNEL_BUILDER( \
- Name("Asinh") \
- .Device(DEVICE_SYCL) \
- .TypeConstraint<TYPE>("T"), \
- UnaryOp<SYCLDevice, functor::asinh<TYPE>>);
-REGISTER_SYCL_KERNEL(float);
-REGISTER_SYCL_KERNEL(double);
-#undef REGISTER_SYCL_KERNEL
-#endif // TENSORFLOW_USE_SYC
+#ifdef TENSORFLOW_USE_SYCL
+REGISTER2(UnaryOp, SYCL, "Asinh", functor::asinh, float, double);
+#endif // TENSORFLOW_USE_SYCL
#if GOOGLE_CUDA
REGISTER2(UnaryOp, GPU, "Asinh", functor::asinh, float, double);
diff --git a/tensorflow/core/kernels/cwise_op_atanh.cc b/tensorflow/core/kernels/cwise_op_atanh.cc
index 058f5140c5..bbc69e45aa 100644
--- a/tensorflow/core/kernels/cwise_op_atanh.cc
+++ b/tensorflow/core/kernels/cwise_op_atanh.cc
@@ -20,17 +20,9 @@ namespace tensorflow {
REGISTER4(UnaryOp, CPU, "Atanh", functor::atanh, float, double,
complex64, complex128);
-#if TENSORFLOW_USE_SYCL
-#define REGISTER_SYCL_KERNEL(TYPE) \
- REGISTER_KERNEL_BUILDER( \
- Name("Atanh") \
- .Device(DEVICE_SYCL) \
- .TypeConstraint<TYPE>("T"), \
- UnaryOp<SYCLDevice, functor::atanh<TYPE>>);
-REGISTER_SYCL_KERNEL(float);
-REGISTER_SYCL_KERNEL(double);
-#undef REGISTER_SYCL_KERNEL
-#endif // TENSORFLOW_USE_SYC
+#ifdef TENSORFLOW_USE_SYCL
+REGISTER2(UnaryOp, SYCL, "Atanh", functor::atanh, float, double);
+#endif // TENSORFLOW_USE_SYCL
#if GOOGLE_CUDA
REGISTER2(UnaryOp, GPU, "Atanh", functor::atanh, float, double);
diff --git a/tensorflow/core/kernels/cwise_ops.h b/tensorflow/core/kernels/cwise_ops.h
index 6c22b124de..d32185b6bf 100644
--- a/tensorflow/core/kernels/cwise_ops.h
+++ b/tensorflow/core/kernels/cwise_ops.h
@@ -49,7 +49,11 @@ template <typename T>
struct scalar_asinh_op {
EIGEN_EMPTY_STRUCT_CTOR(scalar_asinh_op)
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const T operator()(const T& a) const {
+#if EIGEN_HAS_CXX11_MATH
+ return numext::asinh(a);
+#else
return std::asinh(a);
+#endif // EIGEN_HAS_CXX11_MATH
}
};
template <typename T>
@@ -61,7 +65,11 @@ template <typename T>
struct scalar_acosh_op {
EIGEN_EMPTY_STRUCT_CTOR(scalar_acosh_op)
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const T operator()(const T& a) const {
+#if EIGEN_HAS_CXX11_MATH
+ return numext::acosh(a);
+#else
return std::acosh(a);
+#endif // EIGEN_HAS_CXX11_MATH
}
};
template <typename T>
@@ -73,7 +81,11 @@ template <typename T>
struct scalar_atanh_op {
EIGEN_EMPTY_STRUCT_CTOR(scalar_atanh_op)
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const T operator()(const T& a) const {
+#if EIGEN_HAS_CXX11_MATH
+ return numext::atanh(a);
+#else
return std::atanh(a);
+#endif // EIGEN_HAS_CXX11_MATH
}
};
template <typename T>
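
The guard above prefers Eigen's numext wrappers when Eigen exposes them (EIGEN_HAS_CXX11_MATH), since those also work in device code and for Eigen's own scalar types, and falls back to the host-only std:: functions otherwise. A reduced sketch of the same dispatch in an isolated functor (names are illustrative; assumes the Eigen tensor headers):

#include <cmath>
#include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"

// Illustrative functor using the same guard as the ops above.
template <typename T>
struct scalar_asinh_ref_op {
  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T operator()(const T& a) const {
#if EIGEN_HAS_CXX11_MATH
    return Eigen::numext::asinh(a);  // device-safe, supports Eigen types
#else
    return std::asinh(a);            // host-only C++11 fallback
#endif
  }
};
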
diff --git a/tensorflow/core/kernels/dataset.cc b/tensorflow/core/kernels/dataset.cc
index fcfa2956f7..0414875a5d 100644
--- a/tensorflow/core/kernels/dataset.cc
+++ b/tensorflow/core/kernels/dataset.cc
@@ -126,6 +126,7 @@ void BinaryDatasetOpKernel::MakeDataset(OpKernelContext* ctx,
MakeDataset(ctx, input, another_input, output);
}
+const char IteratorBase::kIteratorExhausted[] = "ITERATOR_EXHAUSTED";
const char GraphDatasetBase::kDatasetGraphKey[] = "_DATASET_GRAPH";
const char GraphDatasetBase::kDatasetGraphOutputNodeKey[] =
"_DATASET_GRAPH_OUTPUT_NODE";
diff --git a/tensorflow/core/kernels/dataset.h b/tensorflow/core/kernels/dataset.h
index aa4f436b39..4a42ac80c3 100644
--- a/tensorflow/core/kernels/dataset.h
+++ b/tensorflow/core/kernels/dataset.h
@@ -306,14 +306,27 @@ class IteratorBase {
// Saves the state of this iterator.
virtual Status Save(IteratorStateWriter* writer) {
- return SaveInternal(writer);
+ if (is_exhausted_) {
+ LOG(INFO) << "Iterator exhausted.";
+ return writer->WriteScalar(kIteratorExhausted, kIteratorExhausted);
+ } else {
+ return SaveInternal(writer);
+ }
}
// Restores the state of this iterator.
virtual Status Restore(OpKernelContext* ctx, IteratorStateReader* reader) {
- return RestoreInternal(ctx, reader);
+ if (reader->Contains(kIteratorExhausted)) {
+ LOG(INFO) << "Iterator exhausted. Nothing to restore.";
+ is_exhausted_ = true;
+ return Status::OK();
+ } else {
+ return RestoreInternal(ctx, reader);
+ }
}
+ static const char kIteratorExhausted[];
+
protected:
// This is needed so that sub-classes of IteratorBase can call
// `SaveInternal` on their parent iterators, e.g., in
@@ -341,6 +354,8 @@ class IteratorBase {
IteratorStateReader* reader) {
return errors::Unimplemented("RestoreInternal");
}
+
+ bool is_exhausted_ = false; // Whether the iterator has been exhausted.
};
// Represents a (potentially infinite) range of outputs, where each
@@ -476,6 +491,10 @@ class DatasetIterator : public IteratorBase {
Status GetNext(IteratorContext* ctx, std::vector<Tensor>* out_tensors,
bool* end_of_sequence) final {
port::Tracing::TraceMe activity(params_.prefix);
+ if (is_exhausted_) {
+ *end_of_sequence = true;
+ return Status::OK();
+ }
return GetNextInternal(ctx, out_tensors, end_of_sequence);
}
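
The Save/Restore change above amounts to a small protocol: an exhausted iterator serializes a single sentinel key instead of its internal state, and Restore short-circuits when it finds that key. A minimal sketch of the same idea outside the IteratorBase hierarchy (StateStore and CountingIterator are illustrative stand-ins, not TensorFlow types):

#include <map>
#include <string>

// Illustrative stand-in for IteratorStateWriter/IteratorStateReader.
struct StateStore {
  std::map<std::string, std::string> kv;
  void WriteScalar(const std::string& k, const std::string& v) { kv[k] = v; }
  bool Contains(const std::string& k) const { return kv.count(k) != 0; }
};

constexpr char kExhausted[] = "ITERATOR_EXHAUSTED";

struct CountingIterator {
  int next = 0;
  bool exhausted = false;

  void Save(StateStore* w) const {
    if (exhausted) {
      w->WriteScalar(kExhausted, kExhausted);  // sentinel only, no real state
    } else {
      w->WriteScalar("next", std::to_string(next));  // real state
    }
  }
  void Restore(const StateStore& r) {
    if (r.Contains(kExhausted)) {
      exhausted = true;  // nothing else to restore
    } else {
      next = std::stoi(r.kv.at("next"));
    }
  }
};
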
diff --git a/tensorflow/core/kernels/depthwise_conv_grad_op.cc b/tensorflow/core/kernels/depthwise_conv_grad_op.cc
index 9804d7d38e..53d65a22d1 100644
--- a/tensorflow/core/kernels/depthwise_conv_grad_op.cc
+++ b/tensorflow/core/kernels/depthwise_conv_grad_op.cc
@@ -231,7 +231,7 @@ static void CopyOutputBackpropRegion(const DepthwiseArgs& args,
}
// Pad to vector-register width (if needed).
for (int64 d = 0; d < pad_size; ++d) {
- buffer[buf_base + vectorized_size + scalar_size + d] = 0;
+ buffer[buf_base + vectorized_size + scalar_size + d] = static_cast<T>(0);
}
}
}
@@ -297,7 +297,7 @@ static void ComputeBackpropInput(const DepthwiseArgs& args,
for (int i = 0; i < output_vectorized_size; i += kPacketSize) {
// Reset accumulator.
- auto vaccum = Eigen::internal::pset1<Packet>(0);
+ auto vaccum = Eigen::internal::pset1<Packet>(static_cast<T>(0));
for (int j = 0; j < filter_spatial_size; ++j) {
// Calculate index.
const int64 index = i + j * padded_filter_inner_dim_size;
@@ -318,7 +318,7 @@ static void ComputeBackpropInput(const DepthwiseArgs& args,
}
if (output_scalar_size > 0) {
- auto vaccum = Eigen::internal::pset1<Packet>(0);
+ auto vaccum = Eigen::internal::pset1<Packet>(static_cast<T>(0));
for (int j = 0; j < filter_spatial_size; ++j) {
const int64 index =
output_vectorized_size + j * padded_filter_inner_dim_size;
@@ -346,7 +346,7 @@ static void ComputeBackpropInput(const DepthwiseArgs& args,
if (depth_multiplier > 1) {
for (int64 d = 0; d < in_depth; ++d) {
const int64 index = d * args.depth_multiplier;
- T accum = 0;
+ T accum = static_cast<T>(0);
for (int64 dm = 0; dm < dm_vectorized_size; dm += kPacketSize) {
const auto v = Eigen::internal::ploadu<Packet>(out_buffer + index + dm);
accum += Eigen::internal::predux(v);
@@ -510,6 +510,7 @@ static void DepthwiseConvBackpropInputReference(const DepthwiseArgs& args,
#if GOOGLE_CUDA
+extern template struct LaunchDepthwiseConvBackpropInputOp<GPUDevice, Eigen::half>;
extern template struct LaunchDepthwiseConvBackpropInputOp<GPUDevice, float>;
extern template struct LaunchDepthwiseConvBackpropInputOp<GPUDevice, double>;
@@ -884,6 +885,7 @@ static void DepthwiseConvBackpropFilterReference(const DepthwiseArgs& args,
#if GOOGLE_CUDA
+extern template struct LaunchDepthwiseConvBackpropFilterOp<GPUDevice, Eigen::half>;
extern template struct LaunchDepthwiseConvBackpropFilterOp<GPUDevice, float>;
extern template struct LaunchDepthwiseConvBackpropFilterOp<GPUDevice, double>;
diff --git a/tensorflow/core/kernels/depthwise_conv_op.cc b/tensorflow/core/kernels/depthwise_conv_op.cc
index bbeeaf7895..2759ecb2f1 100644
--- a/tensorflow/core/kernels/depthwise_conv_op.cc
+++ b/tensorflow/core/kernels/depthwise_conv_op.cc
@@ -94,7 +94,7 @@ struct DepthwiseConv2DKernel {
for (int i = 0; i < output_vectorized_size; i += kPacketSize) {
// Reset accumulator.
- auto vaccum = Eigen::internal::pset1<Packet>(0);
+ auto vaccum = Eigen::internal::pset1<Packet>(static_cast<T>(0));
for (int j = 0; j < filter_spatial_size; ++j) {
// Calculate index.
const int64 index = i + j * padded_filter_inner_dim_size;
@@ -115,7 +115,7 @@ struct DepthwiseConv2DKernel {
}
if (output_scalar_size > 0) {
- auto vaccum = Eigen::internal::pset1<Packet>(0);
+ auto vaccum = Eigen::internal::pset1<Packet>(static_cast<T>(0));
for (int j = 0; j < filter_spatial_size; ++j) {
const int64 index =
output_vectorized_size + j * padded_filter_inner_dim_size;
@@ -246,6 +246,7 @@ extern template class LaunchConv2DOp<CPUDevice, float>;
#if GOOGLE_CUDA
// Extern template instantiated in depthwise_conv_op_gpu.cc.
+extern template struct LaunchDepthwiseConvOp<GPUDevice, Eigen::half>;
extern template struct LaunchDepthwiseConvOp<GPUDevice, float>;
extern template struct LaunchDepthwiseConvOp<GPUDevice, double>;
@@ -419,6 +420,7 @@ class DepthwiseConv2dNativeOp : public BinaryOp<T> {
Name("DepthwiseConv2dNative").Device(DEVICE_CPU).TypeConstraint<T>("T"), \
DepthwiseConv2dNativeOp<CPUDevice, T>);
+TF_CALL_half(REGISTER_CPU_KERNEL);
TF_CALL_float(REGISTER_CPU_KERNEL);
#if !defined(PLATFORM_WINDOWS) || !defined(_DEBUG)
TF_CALL_double(REGISTER_CPU_KERNEL);
@@ -426,6 +428,10 @@ TF_CALL_double(REGISTER_CPU_KERNEL);
#if GOOGLE_CUDA
REGISTER_KERNEL_BUILDER(
+ Name("DepthwiseConv2dNative").Device(DEVICE_GPU).TypeConstraint<Eigen::half>("T"),
+ DepthwiseConv2dNativeOp<GPUDevice, Eigen::half>);
+
+REGISTER_KERNEL_BUILDER(
Name("DepthwiseConv2dNative").Device(DEVICE_GPU).TypeConstraint<float>("T"),
DepthwiseConv2dNativeOp<GPUDevice, float>);
diff --git a/tensorflow/core/kernels/depthwise_conv_op.h b/tensorflow/core/kernels/depthwise_conv_op.h
index aa5b5c76f6..11aed5b415 100644
--- a/tensorflow/core/kernels/depthwise_conv_op.h
+++ b/tensorflow/core/kernels/depthwise_conv_op.h
@@ -158,7 +158,7 @@ struct DepthwiseFilterPadOp {
}
// Pad the remainder of output to vector-register boundary.
for (int64 j = 0; j < pad_size; ++j) {
- padded_filter[output_base + vectorized_size + scalar_size + j] = 0;
+ padded_filter[output_base + vectorized_size + scalar_size + j] = static_cast<T>(0);
}
}
}
@@ -266,7 +266,7 @@ struct DepthwiseInputCopyOp {
// Pad the remainder of the output to vector register boundary.
for (int64 d = 0; d < output_pad_size; ++d) {
- in_buf[d] = 0;
+ in_buf[d] = static_cast<T>(0);
}
in_buf += output_pad_size;
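
The static_cast<T>(0) changes in this file and its neighbors are what let these templates instantiate for T = Eigen::half: Eigen::half does not implicitly convert from integer literals, so a bare `= 0` may fail to compile once the half kernels are registered. A minimal illustration (assuming Eigen's half support; ZeroFill is a hypothetical helper):

#include "third_party/eigen3/Eigen/Core"

template <typename T>
void ZeroFill(T* buf, int n) {
  for (int i = 0; i < n; ++i) {
    buf[i] = static_cast<T>(0);  // works for float, double, and Eigen::half
    // buf[i] = 0;               // may not compile for T = Eigen::half
  }
}

int main() {
  Eigen::half h[4];
  float f[4];
  ZeroFill(h, 4);
  ZeroFill(f, 4);
  return 0;
}
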
diff --git a/tensorflow/core/kernels/depthwise_conv_op_gpu.cu.cc b/tensorflow/core/kernels/depthwise_conv_op_gpu.cu.cc
index ecfe51d599..903aac5d68 100644
--- a/tensorflow/core/kernels/depthwise_conv_op_gpu.cu.cc
+++ b/tensorflow/core/kernels/depthwise_conv_op_gpu.cu.cc
@@ -105,7 +105,7 @@ __global__ void __launch_bounds__(1024, 2)
const int input_row_end = input_row_start + filter_rows;
const int input_col_end = input_col_start + filter_cols;
- T sum = 0;
+ T sum = static_cast<T>(0);
const int input_offset_temp = in_rows * OB;
if (input_row_start >= 0 && input_col_start >= 0 &&
@@ -258,8 +258,8 @@ __global__ __launch_bounds__(1024, 2) void DepthwiseConv2dGPUKernelNHWCSmall(
__syncthreads();
if (depth_in_range) {
- T sum1 = 0;
- T sum2 = 0;
+ T sum1 = static_cast<T>(0);
+ T sum2 = static_cast<T>(0);
int shared_offset = data_idx;
const T* filter_ptr = filter_read_offset + shared_data;
UNROLL for (int r = 0; r < filter_rows; ++r) {
@@ -369,7 +369,7 @@ __global__ void __launch_bounds__(1024, 2)
const int input_row_end = input_row_start + filter_rows;
const int input_col_end = input_col_start + filter_cols;
- T sum = 0;
+ T sum = static_cast<T>(0);
if (input_row_start >= 0 && input_col_start >= 0 &&
input_row_end < in_rows && input_col_end < in_cols) {
// Loop that doesn't need to check for boundary conditions.
@@ -529,8 +529,8 @@ __global__ __launch_bounds__(1024, 2) void DepthwiseConv2dGPUKernelNCHWSmall(
__syncthreads();
if (slice_in_range) {
- T sum1 = 0;
- T sum2 = 0;
+ T sum1 = static_cast<T>(0);
+ T sum2 = static_cast<T>(0);
int shared_offset = data_idx;
const T* filter_ptr = filter_read_offset + shared_data;
UNROLL for (int r = 0; r < filter_rows; ++r) {
@@ -710,6 +710,7 @@ void LaunchDepthwiseConvOp<GPUDevice, T>::operator()(OpKernelContext* ctx,
"Launch of gpu kernel for DepthwiseConv2dGPULaunch failed"));
}
+template struct LaunchDepthwiseConvOp<GPUDevice, Eigen::half>;
template struct LaunchDepthwiseConvOp<GPUDevice, float>;
template struct LaunchDepthwiseConvOp<GPUDevice, double>;
@@ -744,7 +745,7 @@ __global__ void __launch_bounds__(640, 2)
const int in_r = (thread_id / in_depth / in_cols) % in_rows;
const int b = thread_id / in_depth / in_cols / in_rows;
- T sum = 0;
+ T sum = static_cast<T>(0);
const int out_r_start =
tf_max<int>(0, (in_r - filter_rows + pad_rows + stride) / stride);
@@ -810,7 +811,7 @@ __global__ void __launch_bounds__(640, 2)
const int in_d = (thread_id / in_cols / in_rows) % in_depth;
const int b = thread_id / in_depth / in_cols / in_rows;
- T sum = 0;
+ T sum = static_cast<T>(0);
const int out_d_start = in_d * depth_multiplier;
const int out_d_end = out_d_start + depth_multiplier;
@@ -919,6 +920,7 @@ void LaunchDepthwiseConvBackpropInputOp<GPUDevice, T>::operator()(
"utGPULaunch failed"));
}
+template struct LaunchDepthwiseConvBackpropInputOp<GPUDevice, Eigen::half>;
template struct LaunchDepthwiseConvBackpropInputOp<GPUDevice, float>;
template struct LaunchDepthwiseConvBackpropInputOp<GPUDevice, double>;
@@ -1631,6 +1633,7 @@ void LaunchDepthwiseConvBackpropFilterOp<GPUDevice, T>::operator()(
"terGPULaunch failed"));
}
+template struct LaunchDepthwiseConvBackpropFilterOp<GPUDevice, Eigen::half>;
template struct LaunchDepthwiseConvBackpropFilterOp<GPUDevice, float>;
template struct LaunchDepthwiseConvBackpropFilterOp<GPUDevice, double>;
} // namespace tensorflow
diff --git a/tensorflow/core/kernels/dynamic_partition_op_gpu.cu.cc b/tensorflow/core/kernels/dynamic_partition_op_gpu.cu.cc
new file mode 100644
index 0000000000..7249c8c66c
--- /dev/null
+++ b/tensorflow/core/kernels/dynamic_partition_op_gpu.cu.cc
@@ -0,0 +1,376 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+// The algorithm for dynamic partition has the following steps:
+// 1. Let N be the size of partitions. We initialize a new vector indices_in
+// with the values 0, 1, 2, ..., N-1.
+// 2. We apply cub::DeviceRadixSort::SortPairs to the key-value pairs given
+// by partitions and indices_in. This will result in two new vectors
+// partitions_out and indices_out, with partitions_out sorted.
+// 3. The first dimension of outputs[i] is equal to the length of the interval
+// of i-values in partitions_out. We determine it in two steps:
+// - compute the starting and ending point of each interval,
+// - subtract the starting and ending points to find the length.
+// The result is placed in partition_count.
+// 4. Because partition_count is on the GPU, we bring it asynchronously to
+// the CPU. Then we can allocate the output tensors.
+// 5. Finally, we use indices_out and the gather functor to collect the output.
+// This works, because for each interval of i-values, indices_out points
+// to the slices which should form output[i].
+
+#if GOOGLE_CUDA
+
+#define EIGEN_USE_GPU
+
+#include "external/cub_archive/cub/device/device_radix_sort.cuh"
+#include "tensorflow/core/common_runtime/gpu/gpu_event_mgr.h"
+#include "tensorflow/core/framework/op_kernel.h"
+#include "tensorflow/core/framework/register_types.h"
+#include "tensorflow/core/framework/tensor.h"
+#include "tensorflow/core/framework/types.h"
+#include "tensorflow/core/kernels/bounds_check.h"
+#include "tensorflow/core/kernels/fill_functor.h"
+#include "tensorflow/core/kernels/gather_functor_gpu.cu.h"
+#include "tensorflow/core/util/cuda_kernel_helper.h"
+
+namespace tensorflow {
+
+typedef Eigen::GpuDevice GPUDevice;
+
+namespace {
+
+template <typename T>
+__global__ void RangeInitKernel(const T start, const T delta, const int32 size,
+ T* out) {
+ CUDA_1D_KERNEL_LOOP(i, size) { out[i] = start + i * delta; }
+}
+
+__global__ void FindEndpointsKernel(const int32* partitions, int32 size,
+ int32 nump, int32* start, int32* end) {
+ CUDA_1D_KERNEL_LOOP(i, size) {
+ int32 current = ldg(partitions + i);
+ if (FastBoundsCheck(current, nump)) {
+ if (i == 0)
+ start[current] = i;
+ else {
+ int32 before = ldg(partitions + i - 1);
+ if (before != current) start[current] = i;
+ }
+ if (i == size - 1)
+ end[current] = i + 1;
+ else {
+ int32 after = ldg(partitions + i + 1);
+ if (after != current) end[current] = i + 1;
+ }
+ }
+ }
+}
+
+// We create a local version of subtract, because the tf.subtract kernel
+// is not available for int32 on the GPU. We use it to compute the length
+// of an interval by subtracting its endpoints.
+__global__ void IntervalLengthKernel(int32* start, int32 size, int32* end) {
+ CUDA_1D_KERNEL_LOOP(i, size) {
+ int32 start_point = ldg(start + i);
+ end[i] = end[i] - start_point;
+ }
+}
+
+// Initialize out with range start, start + delta, start + 2 * delta, ...
+// This is needed because tf.range has no GPU implementation.
+template <typename T>
+void RangeInit(const GPUDevice& d, const T start, const T delta,
+ const int32 size, typename TTypes<T>::Flat out) {
+ CudaLaunchConfig config = GetCudaLaunchConfig(size, d);
+ RangeInitKernel<
+ T><<<config.block_count, config.thread_per_block, 0, d.stream()>>>(
+ start, delta, size, out.data());
+}
+
+// partitions is a sorted vector of N non-negative integers.
+// This function computes the starting and ending points of each interval
+// of values.
+void ComputeIntervals(const GPUDevice& d, Tensor* partitions, int32 N,
+ int32 nump, int32* start_ptr, int32* end_ptr) {
+ CudaLaunchConfig config = GetCudaLaunchConfig(N, d);
+ FindEndpointsKernel<<<config.block_count, config.thread_per_block, 0,
+ d.stream()>>>(partitions->flat<int32>().data(), N, nump,
+ start_ptr, end_ptr);
+}
+
+// Subtract the ending points of each interval to obtain the interval length.
+void ComputeItvLength(const GPUDevice& d, int32 num, int32* start_ptr,
+ int32* end_ptr) {
+ CudaLaunchConfig config = GetCudaLaunchConfig(num, d);
+ IntervalLengthKernel<<<config.block_count, config.thread_per_block, 0,
+ d.stream()>>>(start_ptr, num, end_ptr);
+}
+
+template <typename T>
+void CallGatherKernel(const GPUDevice& d, const T* params, const int32* indices,
+ T* out, int64 gather_dim_size, int64 indices_size,
+ int64 slice_size, int64 out_size) {
+ CudaLaunchConfig config = GetCudaLaunchConfig(out_size, d);
+ GatherOpKernel<
+ T, int32,
+ true><<<config.block_count, config.thread_per_block, 0, d.stream()>>>(
+ params, indices, out, gather_dim_size, indices_size, slice_size,
+ out_size);
+}
+
+} // namespace
+
+// The current implementation has the following memory cost on the GPU:
+// I + P + max(3N + R, O + N), where:
+// I - the size of the input
+// N - the size of the partitions tensor
+// R - the temporary storage used by cub::RadixSort, about 2N
+// P - the number of partitions
+// O - the size of the output
+// So roughly the cost is I + P + max(5N, O + N).
+template <typename T>
+class DynamicPartitionOpGPU : public AsyncOpKernel {
+ public:
+ explicit DynamicPartitionOpGPU(OpKernelConstruction* c) : AsyncOpKernel(c) {
+ OP_REQUIRES_OK(c, c->GetAttr("num_partitions", &num_partitions_));
+ OP_REQUIRES(c, num_partitions_ >= 1,
+ errors::InvalidArgument("num_partitions must be at least 1"));
+ }
+
+ void AllocateTempSpace(OpKernelContext* c, int32 N, Tensor* indices_in,
+ Tensor* partitions_out, Tensor* indices_out,
+ DoneCallback done) {
+ int32 M = std::max(N, num_partitions_);
+    // indices_in will be made slightly larger to accommodate
+ // later computations.
+ OP_REQUIRES_OK_ASYNC(
+ c, c->allocate_temp(DT_INT32, TensorShape({M}), indices_in), done);
+ OP_REQUIRES_OK_ASYNC(
+ c, c->allocate_temp(DT_INT32, TensorShape({N}), partitions_out), done);
+ OP_REQUIRES_OK_ASYNC(
+ c, c->allocate_temp(DT_INT32, TensorShape({N}), indices_out), done);
+ }
+
+ void AllocateOutputs(OpKernelContext* c, const Tensor* data,
+ const Tensor* partitions, const Tensor* partition_count,
+ OpOutputList* Tout, DoneCallback done) {
+ auto e_part_count = partition_count->flat<int32>();
+ // Allocate output tensors of the right size
+ OP_REQUIRES_OK_ASYNC(c, c->output_list("outputs", Tout), done);
+ for (int p = 0; p < num_partitions_; p++) {
+ TensorShape shape;
+ shape.AddDim(e_part_count(p));
+ for (int i = partitions->dims(); i < data->dims(); i++) {
+ shape.AddDim(data->dim_size(i));
+ }
+ Tensor* out;
+ OP_REQUIRES_OK_ASYNC(c, Tout->allocate(p, shape, &out), done);
+ }
+ }
+
+ void ComputeAsync(OpKernelContext* c, DoneCallback done) {
+ const Tensor& data = c->input(0);
+ const Tensor& partitions = c->input(1);
+
+ OP_REQUIRES_ASYNC(
+ c, TensorShapeUtils::StartsWith(data.shape(), partitions.shape()),
+ errors::InvalidArgument("data.shape must start with partitions.shape, ",
+ "got data.shape = ", data.shape().DebugString(),
+ ", partitions.shape = ",
+ partitions.shape().DebugString()),
+ done);
+
+ Tensor partition_count;
+
+ // We must handle the case of empty partitions separately,
+ // because kernels don't work with 0-sized tensors.
+ if (partitions.NumElements() == 0) {
+ AllocatorAttributes alloc_attr;
+ alloc_attr.set_on_host(true);
+ OP_REQUIRES_OK_ASYNC(
+ c, c->allocate_temp(DT_INT32, TensorShape({num_partitions_}),
+ &partition_count, alloc_attr),
+ done);
+ auto e_part_count = partition_count.flat<int32>();
+ for (int i = 0; i < num_partitions_; i++) e_part_count(i) = 0;
+ OpOutputList outputs;
+ this->AllocateOutputs(c, &data, &partitions, &partition_count, &outputs,
+ done);
+ if (c->status().ok()) done();
+ return;
+ }
+
+ // Prepare for counting.
+ OP_REQUIRES_OK_ASYNC(
+ c, c->allocate_temp(DT_INT32, TensorShape({num_partitions_}),
+ &partition_count),
+ done);
+ Tensor indices_out;
+ // Count how many times each partition index occurs.
+ // Also sort the info in partitions and output it in indices_out,
+ // in preparation for the next step.
+ this->CountAndSortParts(c, &partitions, &partition_count, &indices_out,
+ done);
+ if (!c->status().ok()) return;
+
+ // In order to allocate the output tensor we have to move partition_count
+ // to CPU.
+ auto* stream = c->op_device_context()->stream();
+ OP_REQUIRES_ASYNC(c, stream, errors::Internal("No GPU stream available."),
+ done);
+ Tensor cpu_tensor;
+ AllocatorAttributes alloc_attr;
+ alloc_attr.set_on_host(true);
+ alloc_attr.set_gpu_compatible(true);
+ OP_REQUIRES_OK_ASYNC(
+ c, c->allocate_temp(partition_count.dtype(), partition_count.shape(),
+ &cpu_tensor, alloc_attr),
+ done);
+ perftools::gputools::DeviceMemoryBase wrapped(
+ partition_count.flat<int32>().data(), num_partitions_ * sizeof(int32));
+ const bool status =
+ stream
+ ->ThenMemcpy(cpu_tensor.flat<int32>().data(), wrapped,
+ num_partitions_ * sizeof(int32))
+ .ok();
+ OP_REQUIRES_ASYNC(
+ c, status,
+ errors::Internal("Failed to launch copy from device to host."), done);
+
+ // Keep a reference to partition_count so that the buffer
+ // is not deallocated at the end of the function, before
+ // memcpy is completed.
+ TensorReference partition_ref(partition_count);
+ auto wrapped_callback = [this, c, &data, &partitions, indices_out,
+ partition_ref, cpu_tensor, done]() {
+ OpOutputList outputs;
+ this->AllocateOutputs(c, &data, &partitions, &cpu_tensor, &outputs, done);
+ if (!c->status().ok()) {
+ partition_ref.Unref();
+ return;
+ }
+ int32 N = partitions.NumElements();
+ int64 slice_size = data.NumElements() / N;
+ this->GatherSlices(c, &data, &indices_out, N, slice_size, outputs);
+ partition_ref.Unref();
+ done();
+ };
+
+ c->device()->tensorflow_gpu_device_info()->event_mgr->ThenExecute(
+ stream, wrapped_callback);
+ }
+
+ protected:
+ void RadixSort(OpKernelContext* c, const Tensor* partitions,
+ Tensor* indices_in, Tensor* partitions_out,
+ Tensor* indices_out, DoneCallback done) {
+ int32 N = partitions->NumElements();
+ const GPUDevice& device = c->eigen_device<GPUDevice>();
+ const cudaStream_t& cu_stream = GetCudaStream(c);
+
+ // Initialize the indices_in tensor using the Range GPU kernel.
+ RangeInit(device, 0, 1, N, indices_in->flat<int32>());
+ // Obtain the pointers to inner buffers.
+ const int32* partitions_ptr = partitions->flat<int32>().data();
+ int32* partitions_out_ptr = partitions_out->flat<int32>().data();
+ int32* indices_in_ptr = indices_in->flat<int32>().data();
+ int32* indices_out_ptr = indices_out->flat<int32>().data();
+ // Determine temporary device storage requirements.
+ Tensor cub_temp_storage;
+ size_t temp_storage_bytes = 0;
+ cub::DeviceRadixSort::SortPairs(
+ NULL, temp_storage_bytes, partitions_ptr, partitions_out_ptr,
+ indices_in_ptr, indices_out_ptr, N, 0, sizeof(int32) * 8, cu_stream);
+ // Allocate temporary storage.
+ OP_REQUIRES_OK_ASYNC(
+ c, c->allocate_temp(
+ DT_INT8, TensorShape({static_cast<int64>(temp_storage_bytes)}),
+ &cub_temp_storage),
+ done);
+ // Radix-sort the partition information.
+ cub::DeviceRadixSort::SortPairs(
+ cub_temp_storage.flat<int8>().data(), temp_storage_bytes,
+ partitions_ptr, partitions_out_ptr, indices_in_ptr, indices_out_ptr, N,
+ 0, sizeof(int32) * 8, cu_stream);
+ } // At this point cub_temp_storage will be marked for deallocation.
+
+ void CountAndSortParts(OpKernelContext* c, const Tensor* partitions,
+ Tensor* partition_count, Tensor* indices_out,
+ DoneCallback done) {
+ const GPUDevice& device = c->eigen_device<GPUDevice>();
+ int32 N = partitions->NumElements();
+ Tensor indices_in;
+ Tensor partitions_out;
+
+ // Allocate memory for Radix-Sort.
+ this->AllocateTempSpace(c, N, &indices_in, &partitions_out, indices_out,
+ done);
+ if (!c->status().ok()) return;
+ this->RadixSort(c, partitions, &indices_in, &partitions_out, indices_out,
+ done);
+ if (!c->status().ok()) return;
+    // We still need a small amount of additional memory, but we can
+    // reuse the indices_in tensor for it. Atomic operations would avoid
+    // the extra allocation entirely, but this approach seems faster.
+
+ // Zero-out the allocated memory.
+ functor::SetZeroFunctor<GPUDevice, int32> zero_functor;
+ zero_functor(device, partition_count->flat<int32>());
+ zero_functor(device, indices_in.flat<int32>());
+ // Obtain the pointers to inner buffers.
+ int32* start_ptr = indices_in.flat<int32>().data();
+ int32* end_ptr = partition_count->flat<int32>().data();
+ // Obtain the starting and ending points of each interval.
+ ComputeIntervals(device, &partitions_out, N, num_partitions_, start_ptr,
+ end_ptr);
+ // Subtract to compute the number of appearances of each id.
+ ComputeItvLength(device, num_partitions_, start_ptr, end_ptr);
+ } // At this point indices_in and partitions_out will be marked
+ // for deallocation.
+
+ void GatherSlices(OpKernelContext* c, const Tensor* data,
+ const Tensor* indices, int32 N, int64 slice_size,
+ OpOutputList& outs) {
+ const GPUDevice& device = c->eigen_device<GPUDevice>();
+ const int32* ind_base = indices->flat<int32>().data();
+ const T* data_base = data->flat<T>().data();
+
+ for (int p = 0; p < num_partitions_; p++) {
+ int32 indices_size = outs[p]->dim_size(0);
+ int64 out_size = outs[p]->NumElements();
+ T* out_base = outs[p]->flat<T>().data();
+ if (out_size > 0)
+ CallGatherKernel<T>(device, data_base, ind_base, out_base, N,
+ indices_size, slice_size, out_size);
+ ind_base += indices_size;
+ }
+ }
+
+ int num_partitions_;
+};
+
+#define REGISTER_DYNAMIC_PARTITION_GPU(T) \
+ REGISTER_KERNEL_BUILDER( \
+ Name("DynamicPartition").Device(DEVICE_GPU).TypeConstraint<T>("T"), \
+ DynamicPartitionOpGPU<T>)
+
+TF_CALL_GPU_NUMBER_TYPES(REGISTER_DYNAMIC_PARTITION_GPU);
+TF_CALL_complex64(REGISTER_DYNAMIC_PARTITION_GPU);
+TF_CALL_complex128(REGISTER_DYNAMIC_PARTITION_GPU);
+#undef REGISTER_DYNAMIC_PARTITION_GPU
+
+} // namespace tensorflow
+
+#endif // GOOGLE_CUDA
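
A CPU reference of the five-step plan in the header comment above, with std::stable_sort standing in for cub::DeviceRadixSort (a sketch for intuition, restricted to 1-D float data; not the kernel itself):

#include <algorithm>
#include <numeric>
#include <vector>

std::vector<std::vector<float>> DynamicPartitionRef(
    const std::vector<float>& data, const std::vector<int>& partitions,
    int num_partitions) {
  const int n = static_cast<int>(partitions.size());
  // Step 1: indices_in = 0, 1, ..., N-1.
  std::vector<int> indices(n);
  std::iota(indices.begin(), indices.end(), 0);
  // Step 2: sort indices by partition id (stand-in for the radix sort).
  std::stable_sort(indices.begin(), indices.end(),
                   [&](int a, int b) { return partitions[a] < partitions[b]; });
  // Steps 3-4: count elements per partition and allocate outputs.
  std::vector<std::vector<float>> outputs(num_partitions);
  std::vector<int> counts(num_partitions, 0);
  for (int p : partitions) ++counts[p];
  for (int p = 0; p < num_partitions; ++p) outputs[p].reserve(counts[p]);
  // Step 5: gather the data in sorted-index order.
  for (int idx : indices) outputs[partitions[idx]].push_back(data[idx]);
  return outputs;
}
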
diff --git a/tensorflow/core/kernels/dynamic_partition_op_test.cc b/tensorflow/core/kernels/dynamic_partition_op_test.cc
index 0e8fbc0a67..9a7ed0af21 100644
--- a/tensorflow/core/kernels/dynamic_partition_op_test.cc
+++ b/tensorflow/core/kernels/dynamic_partition_op_test.cc
@@ -16,6 +16,7 @@ limitations under the License.
#include <functional>
#include <memory>
+#include "tensorflow/core/common_runtime/kernel_benchmark_testlib.h"
#include "tensorflow/core/framework/allocator.h"
#include "tensorflow/core/framework/fake_input.h"
#include "tensorflow/core/framework/node_def_builder.h"
@@ -23,10 +24,14 @@ limitations under the License.
#include "tensorflow/core/framework/tensor.h"
#include "tensorflow/core/framework/types.h"
#include "tensorflow/core/framework/types.pb.h"
+#include "tensorflow/core/graph/node_builder.h"
+#include "tensorflow/core/graph/testlib.h"
#include "tensorflow/core/kernels/ops_testutil.h"
#include "tensorflow/core/kernels/ops_util.h"
#include "tensorflow/core/lib/core/status_test_util.h"
+#include "tensorflow/core/lib/random/simple_philox.h"
#include "tensorflow/core/platform/test.h"
+#include "tensorflow/core/platform/test_benchmark.h"
namespace tensorflow {
namespace {
@@ -153,5 +158,58 @@ TEST_F(DynamicPartitionOpTest, Error_IndexOutOfRange) {
<< s;
}
+Node* DynamicPartitionNode(Graph* g, Node* in0, Node* in1, int num_partitions) {
+ Node* ret;
+ TF_CHECK_OK(NodeBuilder(g->NewName("n"), "DynamicPartition")
+ .Input(in0)
+ .Input(in1)
+ .Attr("num_partitions", num_partitions)
+ .Finalize(g, &ret));
+ return ret;
+}
+
+template <typename T>
+static Graph* DynamicPartition(int num_partitions, int dim) {
+ Graph* g = new Graph(OpRegistry::Global());
+ // Always use a 128MB buffer.
+ const int kRows = ((128 << 20) / sizeof(T)) / dim;
+ Tensor data(DataTypeToEnum<T>::value, TensorShape({kRows, dim}));
+ data.flat<T>().setRandom();
+
+ random::PhiloxRandom philox(301, 17);
+ random::SimplePhilox rnd(&philox);
+ Tensor partitions(DT_INT32, TensorShape({kRows}));
+ for (int i = 0; i < kRows; i++) {
+ partitions.flat<int32>()(i) = rnd.Uniform(num_partitions);
+ }
+ DynamicPartitionNode(g, test::graph::Constant(g, data),
+ test::graph::Constant(g, partitions), num_partitions);
+ return g;
+}
+
+#define BM_DYNAMIC_PARTITION(DEVICE, T, num) \
+ static void BM_##DEVICE##_dynpart_##T##_##num(int iters, int dim) { \
+ const int64 items = ((128 << 20) / sizeof(T)); \
+ const int64 tot = static_cast<int64>(iters) * items; \
+ testing::ItemsProcessed(tot); \
+ testing::UseRealTime(); \
+ test::Benchmark(#DEVICE, DynamicPartition<T>(num, dim)).Run(iters); \
+ } \
+ BENCHMARK(BM_##DEVICE##_dynpart_##T##_##num)->Arg(1)->Arg(256)
+
+BM_DYNAMIC_PARTITION(cpu, float, 2);
+BM_DYNAMIC_PARTITION(cpu, float, 100);
+BM_DYNAMIC_PARTITION(cpu, double, 2);
+BM_DYNAMIC_PARTITION(cpu, double, 100);
+BM_DYNAMIC_PARTITION(cpu, complex64, 2);
+BM_DYNAMIC_PARTITION(cpu, complex64, 100);
+
+BM_DYNAMIC_PARTITION(gpu, float, 2);
+BM_DYNAMIC_PARTITION(gpu, float, 100);
+BM_DYNAMIC_PARTITION(gpu, double, 2);
+BM_DYNAMIC_PARTITION(gpu, double, 100);
+BM_DYNAMIC_PARTITION(gpu, complex64, 2);
+BM_DYNAMIC_PARTITION(gpu, complex64, 100);
+
} // namespace
} // namespace tensorflow
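
The benchmarks size the input so each run touches a fixed 128MB of data regardless of element type; for example, with T = float and dim = 256 that is (128 << 20) / 4 = 33,554,432 elements, so kRows = 131,072. A back-of-the-envelope check (illustrative only):

#include <cstdio>

int main() {
  const long long bytes = 128LL << 20;            // 128 MB budget
  const long long items = bytes / sizeof(float);  // 33,554,432 floats
  const int dim = 256;
  std::printf("kRows = %lld\n", items / dim);     // prints 131072
  return 0;
}
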
diff --git a/tensorflow/core/kernels/fake_quant_ops_functor.h b/tensorflow/core/kernels/fake_quant_ops_functor.h
index 7aaad6e6c7..b41b22d634 100644
--- a/tensorflow/core/kernels/fake_quant_ops_functor.h
+++ b/tensorflow/core/kernels/fake_quant_ops_functor.h
@@ -132,7 +132,7 @@ struct FakeQuantWithMinMaxVarsFunctor {
const float max_val = max();
// If min and max are both zero, we should just return zero.
if (min_val == 0.0f && max_val == 0.0f) {
- outputs.device(d) = outputs.constant(0.0f);
+ outputs.setZero();
return;
}
float nudged_min, nudged_max, nudged_scale;
@@ -163,8 +163,8 @@ struct FakeQuantWithMinMaxVarsGradientFunctor {
// If min and max are both zero, we propagate everything to inputs.
if (min_val == 0.0f && max_val == 0.0f) {
backprops_wrt_input.device(d) = gradients;
- backprop_wrt_min.device(d) = backprop_wrt_min.constant(0.0f);
- backprop_wrt_max.device(d) = backprop_wrt_max.constant(0.0f);
+ backprop_wrt_min.setZero();
+ backprop_wrt_max.setZero();
return;
}
float nudged_min, nudged_max, nudged_scale;
@@ -205,8 +205,7 @@ struct FakeQuantWithMinMaxVarsPerChannelFunctor {
const float max_val = max(i);
// If min and max are both zero, we should just return zero.
if (min_val == 0.0f && max_val == 0.0f) {
- auto chip = outputs.chip<1>(i);
- chip.device(d) = chip.constant(0.0f);
+ outputs.chip<1>(i).setZero();
continue;
}
float nudged_min, nudged_max, nudged_scale;
@@ -243,10 +242,8 @@ struct FakeQuantWithMinMaxVarsPerChannelGradientFunctor {
// If min and max are both zero, we propagate everything to inputs.
if (min_val == 0.0f && max_val == 0.0f) {
backprops_wrt_input.chip<1>(i).device(d) = gradients_chip;
- auto min_chip = backprop_wrt_min.chip<0>(i);
- auto max_chip = backprop_wrt_max.chip<0>(i);
- min_chip.device(d) = min_chip.constant(0.0f);
- max_chip.device(d) = max_chip.constant(0.0f);
+ backprop_wrt_min.chip<0>(i).setZero();
+ backprop_wrt_max.chip<0>(i).setZero();
continue;
}
float nudged_min, nudged_max, nudged_scale;
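
The setZero() rewrites above replace the round-trip of building a constant-zero expression and assigning it through the device with an in-place zero fill; for the small min/max vectors involved, the two idioms should be interchangeable on CPU. A side-by-side sketch (illustrative, assuming the Eigen tensor headers):

#include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"

int main() {
  float buf[8] = {1, 2, 3, 4, 5, 6, 7, 8};
  Eigen::TensorMap<Eigen::Tensor<float, 1>> outputs(buf, 8);
  Eigen::DefaultDevice d;

  outputs.device(d) = outputs.constant(0.0f);  // old idiom: assign a constant expression
  outputs.setZero();                           // new idiom: zero in place
  return 0;
}
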
diff --git a/tensorflow/core/kernels/fused_batch_norm_op.cc b/tensorflow/core/kernels/fused_batch_norm_op.cc
index 0ecb829f34..1688674eb7 100644
--- a/tensorflow/core/kernels/fused_batch_norm_op.cc
+++ b/tensorflow/core/kernels/fused_batch_norm_op.cc
@@ -54,25 +54,20 @@ struct FusedBatchNorm<CPUDevice, T, U> {
Tensor* batch_var_output, Tensor* saved_mean_output,
Tensor* saved_var_output, TensorFormat tensor_format,
bool is_training) {
- // Currently U is ignored, since we only support the case where T and U are
- // both float32.
- // TODO(reedwm): Add float16 support, use U, and remove these asserts.
- static_assert(std::is_same<T, float>::value, "T currently must be float.");
- static_assert(std::is_same<U, float>::value, "U currently must be float.");
OP_REQUIRES(context, tensor_format == FORMAT_NHWC,
errors::Internal("The CPU implementation of FusedBatchNorm "
"only supports NHWC tensor format for now."));
typename TTypes<T, 4>::ConstTensor x(x_input.tensor<T, 4>());
- typename TTypes<T>::ConstVec scale(scale_input.vec<T>());
- typename TTypes<T>::ConstVec offset(offset_input.vec<T>());
- typename TTypes<T>::ConstVec estimated_mean(estimated_mean_input.vec<T>());
- typename TTypes<T>::ConstVec estimated_variance(
- estimated_variance_input.vec<T>());
+ typename TTypes<U>::ConstVec scale(scale_input.vec<U>());
+ typename TTypes<U>::ConstVec offset(offset_input.vec<U>());
+ typename TTypes<U>::ConstVec estimated_mean(estimated_mean_input.vec<U>());
+ typename TTypes<U>::ConstVec estimated_variance(
+ estimated_variance_input.vec<U>());
typename TTypes<T, 4>::Tensor y(y_output->tensor<T, 4>());
- typename TTypes<T>::Vec batch_mean(batch_mean_output->vec<T>());
- typename TTypes<T>::Vec batch_var(batch_var_output->vec<T>());
- typename TTypes<T>::Vec saved_mean(saved_mean_output->vec<T>());
- typename TTypes<T>::Vec saved_var(saved_var_output->vec<T>());
+ typename TTypes<U>::Vec batch_mean(batch_mean_output->vec<U>());
+ typename TTypes<U>::Vec batch_var(batch_var_output->vec<U>());
+ typename TTypes<U>::Vec saved_mean(saved_mean_output->vec<U>());
+ typename TTypes<U>::Vec saved_var(saved_var_output->vec<U>());
const CPUDevice& d = context->eigen_device<CPUDevice>();
@@ -93,15 +88,15 @@ struct FusedBatchNorm<CPUDevice, T, U> {
bcast_spec.set(0, rest_size);
#endif
- auto x_rest_by_depth = x.reshape(rest_by_depth);
+ auto x_rest_by_depth = x.reshape(rest_by_depth).template cast<U>();
const int rest_size_minus_one = (rest_size > 1) ? (rest_size - 1) : 1;
- T rest_size_inv = static_cast<T>(1.0f / static_cast<T>(rest_size));
+ U rest_size_inv = static_cast<U>(1.0f / static_cast<U>(rest_size));
// This adjustment is for Bessel's correction
- T rest_size_adjust =
- static_cast<T>(rest_size) / static_cast<T>(rest_size_minus_one);
+ U rest_size_adjust =
+ static_cast<U>(rest_size) / static_cast<U>(rest_size_minus_one);
- Eigen::Tensor<T, 1, Eigen::RowMajor> mean(depth);
- Eigen::Tensor<T, 1, Eigen::RowMajor> variance(depth);
+ Eigen::Tensor<U, 1, Eigen::RowMajor> mean(depth);
+ Eigen::Tensor<U, 1, Eigen::RowMajor> variance(depth);
if (is_training) {
mean.device(d) = (x_rest_by_depth.sum(reduce_dims) * rest_size_inv);
batch_mean.device(d) = mean;
@@ -129,7 +124,7 @@ struct FusedBatchNorm<CPUDevice, T, U> {
auto x_shifted =
x_scaled + offset.reshape(one_by_depth).broadcast(bcast_spec);
- y.reshape(rest_by_depth).device(d) = x_shifted;
+ y.reshape(rest_by_depth).device(d) = x_shifted.template cast<T>();
}
};
@@ -138,7 +133,7 @@ struct FusedBatchNormGrad<CPUDevice, T, U> {
void operator()(OpKernelContext* context, const Tensor& y_backprop_input,
const Tensor& x_input, const Tensor& scale_input,
const Tensor& mean_input, const Tensor& variance_input,
- T epsilon, Tensor* x_backprop_output,
+ U epsilon, Tensor* x_backprop_output,
Tensor* scale_backprop_output, Tensor* offset_backprop_output,
TensorFormat tensor_format) {
OP_REQUIRES(context, tensor_format == FORMAT_NHWC,
@@ -147,12 +142,12 @@ struct FusedBatchNormGrad<CPUDevice, T, U> {
typename TTypes<T, 4>::ConstTensor y_backprop(
y_backprop_input.tensor<T, 4>());
typename TTypes<T, 4>::ConstTensor x(x_input.tensor<T, 4>());
- typename TTypes<T>::ConstVec scale(scale_input.vec<T>());
- typename TTypes<T>::ConstVec mean(mean_input.vec<T>());
- typename TTypes<T>::ConstVec variance(variance_input.vec<T>());
+ typename TTypes<U>::ConstVec scale(scale_input.vec<U>());
+ typename TTypes<U>::ConstVec mean(mean_input.vec<U>());
+ typename TTypes<U>::ConstVec variance(variance_input.vec<U>());
typename TTypes<T, 4>::Tensor x_backprop(x_backprop_output->tensor<T, 4>());
- typename TTypes<T>::Vec scale_backprop(scale_backprop_output->vec<T>());
- typename TTypes<T>::Vec offset_backprop(offset_backprop_output->vec<T>());
+ typename TTypes<U>::Vec scale_backprop(scale_backprop_output->vec<U>());
+ typename TTypes<U>::Vec offset_backprop(offset_backprop_output->vec<U>());
// Note: the following formulas are used to compute the gradients for
// back propagation.
@@ -181,8 +176,8 @@ struct FusedBatchNormGrad<CPUDevice, T, U> {
bcast_spec.set(0, rest_size);
#endif
- auto x_rest_by_depth = x.reshape(rest_by_depth);
- T rest_size_inv = static_cast<T>(1.0f / static_cast<T>(rest_size));
+ auto x_rest_by_depth = x.reshape(rest_by_depth).template cast<U>();
+ U rest_size_inv = static_cast<U>(1.0f / static_cast<U>(rest_size));
auto x_mean_rest_by_depth =
mean.reshape(one_by_depth).broadcast(bcast_spec);
@@ -192,7 +187,8 @@ struct FusedBatchNormGrad<CPUDevice, T, U> {
coef0.eval().reshape(one_by_depth).broadcast(bcast_spec);
auto x_scaled = x_centered * coef0_rest_by_depth;
- auto y_backprop_rest_by_depth = y_backprop.eval().reshape(rest_by_depth);
+ auto y_backprop_rest_by_depth =
+ y_backprop.eval().reshape(rest_by_depth).template cast<U>();
scale_backprop.device(d) =
(y_backprop_rest_by_depth * x_scaled).sum(reduce_dims);
auto y_backprop_sum = y_backprop_rest_by_depth.sum(reduce_dims);
@@ -214,7 +210,7 @@ struct FusedBatchNormGrad<CPUDevice, T, U> {
.reshape(one_by_depth)
.broadcast(bcast_spec);
x_backprop.reshape(rest_by_depth).device(d) =
- coef1 * (y_backprop_centered - x_centered * coef2);
+ (coef1 * (y_backprop_centered - x_centered * coef2)).template cast<T>();
}
};
@@ -689,6 +685,18 @@ REGISTER_KERNEL_BUILDER(Name("FusedBatchNormGradV2")
.TypeConstraint<float>("U"),
FusedBatchNormGradOp<CPUDevice, float, float>);
+REGISTER_KERNEL_BUILDER(Name("FusedBatchNormV2")
+ .Device(DEVICE_CPU)
+ .TypeConstraint<Eigen::half>("T")
+ .TypeConstraint<float>("U"),
+ FusedBatchNormOp<CPUDevice, Eigen::half, float>);
+
+REGISTER_KERNEL_BUILDER(Name("FusedBatchNormGradV2")
+ .Device(DEVICE_CPU)
+ .TypeConstraint<Eigen::half>("T")
+ .TypeConstraint<float>("U"),
+ FusedBatchNormGradOp<CPUDevice, Eigen::half, float>);
+
#if GOOGLE_CUDA
REGISTER_KERNEL_BUILDER(
diff --git a/tensorflow/core/kernels/fused_batch_norm_op.h b/tensorflow/core/kernels/fused_batch_norm_op.h
index 38b24d7011..3af104bf95 100644
--- a/tensorflow/core/kernels/fused_batch_norm_op.h
+++ b/tensorflow/core/kernels/fused_batch_norm_op.h
@@ -92,26 +92,28 @@ struct FusedBatchNormFreezeGrad {
// offset_backprop = sum(y_backprop)
// scale_backprop = y_backprop * ((x - pop_mean) * rsqrt(pop_var + epsilon))
// x_backprop = y_backprop * (scale * rsqrt(pop_var + epsilon))
- offset_backprop.device(d) = y_backprop.reshape(rest_by_depth)
- .template cast<U>()
- .sum(reduction_axis);
+
+ auto y_backprop_rest_by_depth =
+ y_backprop.reshape(rest_by_depth).template cast<U>();
+ auto input_rest_by_depth = input.reshape(rest_by_depth).template cast<U>();
+
+ offset_backprop.device(d) = y_backprop_rest_by_depth.sum(reduction_axis);
// scratch1 = rsqrt(pop_var + epsilon)
scratch1.device(d) = (pop_var + pop_var.constant(epsilon)).rsqrt();
// scratch2 = sum(y_backprop * (x - mean))
scratch2.device(d) =
- (y_backprop.reshape(rest_by_depth).template cast<U>() *
- (input.reshape(rest_by_depth).template cast<U>() -
+ (y_backprop_rest_by_depth *
+ (input_rest_by_depth -
pop_mean.reshape(one_by_depth).broadcast(rest_by_one)))
.sum(reduction_axis);
x_backprop.reshape(rest_by_depth).device(d) =
- (y_backprop.reshape(rest_by_depth).template cast<U>() *
- ((scratch1 * scale)
- .eval()
- .reshape(one_by_depth)
- .broadcast(rest_by_one)))
+ (y_backprop_rest_by_depth * ((scratch1 * scale)
+ .eval()
+ .reshape(one_by_depth)
+ .broadcast(rest_by_one)))
.template cast<T>();
scale_backprop.device(d) = scratch2 * scratch1;
}
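
The pattern running through this file and fused_batch_norm_op.cc above: inputs and outputs stay in T (possibly Eigen::half), while the statistics and all arithmetic are carried in U = float via `.template cast<U>()`, with one final `.template cast<T>()` on the way out. A reduced sketch of the same compute-in-float pattern (illustrative; ScaleInHighPrecision is a hypothetical helper):

#include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"

// Scale a (possibly half-precision) vector while computing in float.
template <typename T, typename U>
void ScaleInHighPrecision(const Eigen::Tensor<T, 1>& x, U scale,
                          Eigen::Tensor<T, 1>& y) {
  // Cast up, compute, cast back down, mirroring x_rest_by_depth above.
  y = (x.template cast<U>() * scale).template cast<T>();
}

int main() {
  Eigen::Tensor<Eigen::half, 1> x(4), y(4);
  x.setConstant(Eigen::half(2.0f));
  ScaleInHighPrecision<Eigen::half, float>(x, 0.5f, y);  // y == 1.0 everywhere
  return 0;
}
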
diff --git a/tensorflow/core/kernels/lmdb_reader_op.cc b/tensorflow/core/kernels/lmdb_reader_op.cc
index 3bb07301b5..31a427f2c9 100755
--- a/tensorflow/core/kernels/lmdb_reader_op.cc
+++ b/tensorflow/core/kernels/lmdb_reader_op.cc
@@ -36,7 +36,7 @@ class LMDBReader : public ReaderBase {
Status OnWorkStartedLocked() override {
MDB_CHECK(mdb_env_create(&mdb_env_));
- int flags = MDB_RDONLY | MDB_NOTLS;
+ int flags = MDB_RDONLY | MDB_NOTLS | MDB_NOLOCK;
// Check if the LMDB filename is actually a file instead of a directory.
// If so, set appropriate flags so we can open it.
@@ -57,10 +57,13 @@ class LMDBReader : public ReaderBase {
if (mdb_env_ != nullptr) {
if (mdb_cursor_) {
mdb_cursor_close(mdb_cursor_);
+ mdb_cursor_ = nullptr;
}
- mdb_txn_abort(mdb_txn_);
mdb_dbi_close(mdb_env_, mdb_dbi_);
+ mdb_txn_abort(mdb_txn_);
mdb_env_close(mdb_env_);
+ mdb_txn_ = nullptr;
+ mdb_dbi_ = 0;
mdb_env_ = nullptr;
}
return Status::OK();
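
The reordered teardown above closes the cursor first, then the DBI handle, then aborts the read transaction before closing the environment, and nulls the members so a repeated teardown is harmless. A condensed sketch of that ordering with the same lmdb calls (error handling elided; CloseLmdb is a hypothetical helper):

#include <lmdb.h>

// Teardown in the order used above; guarded so calling twice is a no-op.
void CloseLmdb(MDB_env*& env, MDB_txn*& txn, MDB_dbi& dbi,
               MDB_cursor*& cursor) {
  if (env == nullptr) return;   // already torn down
  if (cursor != nullptr) {
    mdb_cursor_close(cursor);   // cursors first...
    cursor = nullptr;
  }
  mdb_dbi_close(env, dbi);      // ...then the DBI handle...
  mdb_txn_abort(txn);           // ...then the read transaction...
  mdb_env_close(env);           // ...and finally the environment.
  txn = nullptr;
  dbi = 0;
  env = nullptr;
}
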
diff --git a/tensorflow/core/kernels/maxpooling_op.cc b/tensorflow/core/kernels/maxpooling_op.cc
index e2cf605811..157ce106ce 100644
--- a/tensorflow/core/kernels/maxpooling_op.cc
+++ b/tensorflow/core/kernels/maxpooling_op.cc
@@ -20,7 +20,6 @@ limitations under the License.
#include "tensorflow/core/kernels/maxpooling_op.h"
#include <vector>
-#include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
#include "tensorflow/core/common_runtime/device.h"
#include "tensorflow/core/framework/numeric_op.h"
#include "tensorflow/core/framework/op_kernel.h"
@@ -34,9 +33,11 @@ limitations under the License.
#include "tensorflow/core/kernels/pooling_ops_common.h"
#include "tensorflow/core/lib/core/errors.h"
#include "tensorflow/core/lib/gtl/array_slice.h"
+#include "tensorflow/core/util/env_var.h"
#include "tensorflow/core/util/padding.h"
#include "tensorflow/core/util/tensor_format.h"
#include "tensorflow/core/util/use_cudnn.h"
+#include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
#if GOOGLE_CUDA
#include "tensorflow/core/kernels/maxpooling_op_gpu.h"
@@ -358,6 +359,7 @@ class MaxPoolingGradOp<Eigen::GpuDevice, T> : public OpKernel {
OP_REQUIRES_OK(context, context->GetAttr("padding", &padding_));
use_dnn_ = CanUseCudnn();
+ ReadBoolFromEnvVar("TF_ENABLE_MAXPOOL_NANPROP", false, &propagate_nans_);
}
void Compute(OpKernelContext* context) override {
@@ -405,7 +407,7 @@ class MaxPoolingGradOp<Eigen::GpuDevice, T> : public OpKernel {
DnnPoolingGradOp<T>::Compute(
context, perftools::gputools::dnn::PoolingMode::kMaximum, ksize,
stride, padding_, data_format_, &tensor_in, &tensor_out, out_backprop,
- output_shape);
+ output_shape, propagate_nans_);
} else {
CHECK(data_format_ == FORMAT_NHWC)
<< "Non-Cudnn MaxPoolGrad only supports NHWC format";
@@ -420,6 +422,7 @@ class MaxPoolingGradOp<Eigen::GpuDevice, T> : public OpKernel {
Padding padding_;
TensorFormat data_format_;
bool use_dnn_;
+ bool propagate_nans_;
};
#endif // GOOGLE_CUDA
@@ -884,6 +887,8 @@ class MaxPoolingWithArgmaxOp : public OpKernel {
OP_REQUIRES(context, ksize_[0] == 1 && stride_[0] == 1,
errors::Unimplemented(
"Pooling is not yet supported on the batch dimension."));
+
+ ReadBoolFromEnvVar("TF_ENABLE_MAXPOOL_NANPROP", false, &propagate_nans_);
}
void Compute(OpKernelContext* context) override {
@@ -902,14 +907,15 @@ class MaxPoolingWithArgmaxOp : public OpKernel {
Tensor* argmax = nullptr;
OP_REQUIRES_OK(context, context->allocate_output(1, out_shape, &argmax));
- LaunchMaxPoolingWithArgmax<Device, T>::launch(context, params, tensor_in,
- output, argmax);
+ LaunchMaxPoolingWithArgmax<Device, T>::launch(
+ context, params, tensor_in, output, argmax, propagate_nans_);
}
private:
std::vector<int32> ksize_;
std::vector<int32> stride_;
Padding padding_;
+ bool propagate_nans_;
};
template <typename Device, typename T>
@@ -1045,6 +1051,8 @@ class MaxPoolingNoMaskOp<GPUDevice, T> : public OpKernel {
errors::Unimplemented(
"Pooling is not yet supported on the batch dimension."));
use_dnn_ = CanUseCudnn();
+
+ ReadBoolFromEnvVar("TF_ENABLE_MAXPOOL_NANPROP", false, &propagate_nans_);
}
void Compute(OpKernelContext* context) override {
@@ -1068,9 +1076,10 @@ class MaxPoolingNoMaskOp<GPUDevice, T> : public OpKernel {
// These is_int8x4 checks avoid linker errors for missing qint8 kernels.
if (!is_int8x4 && use_dnn_ && data_format_ == FORMAT_NCHW) {
- DnnPoolingOp<T>::Compute(
- context, perftools::gputools::dnn::PoolingMode::kMaximum, ksize_,
- stride_, padding_, data_format_, tensor_in, out_shape);
+ DnnPoolingOp<T>::Compute(context,
+ perftools::gputools::dnn::PoolingMode::kMaximum,
+ ksize_, stride_, padding_, data_format_,
+ tensor_in, out_shape, propagate_nans_);
} else {
Tensor* output = nullptr;
OP_REQUIRES_OK(context, context->allocate_output(0, out_shape, &output));
@@ -1079,7 +1088,7 @@ class MaxPoolingNoMaskOp<GPUDevice, T> : public OpKernel {
tensor_in, output);
} else if (data_format_ == FORMAT_NHWC) {
LaunchMaxPoolingNoMask<Device, T>::launch(context, params, tensor_in,
- output);
+ output, propagate_nans_);
} else {
LOG(FATAL) << "MaxPool currently only supports the following (layout, "
"type) combinations: (NHWC, non-qint8), "
@@ -1098,6 +1107,7 @@ class MaxPoolingNoMaskOp<GPUDevice, T> : public OpKernel {
Padding padding_;
TensorFormat data_format_;
bool use_dnn_;
+ bool propagate_nans_;
};
template <typename T>
@@ -1127,6 +1137,7 @@ class MaxPoolingNoMaskV2Op<GPUDevice, T> : public OpKernel {
}
OP_REQUIRES_OK(context, context->GetAttr("padding", &padding_));
use_dnn_ = CanUseCudnn();
+ ReadBoolFromEnvVar("TF_ENABLE_MAXPOOL_NANPROP", false, &propagate_nans_);
}
void Compute(OpKernelContext* context) override {
@@ -1168,16 +1179,17 @@ class MaxPoolingNoMaskV2Op<GPUDevice, T> : public OpKernel {
ShapeFromFormat(data_format_, params.tensor_in_batch, params.out_height,
params.out_width, params.depth);
if (use_dnn_ && data_format_ == FORMAT_NCHW) {
- DnnPoolingOp<T>::Compute(
- context, perftools::gputools::dnn::PoolingMode::kMaximum, ksize,
- stride, padding_, data_format_, tensor_in, out_shape);
+ DnnPoolingOp<T>::Compute(context,
+ perftools::gputools::dnn::PoolingMode::kMaximum,
+ ksize, stride, padding_, data_format_, tensor_in,
+ out_shape, propagate_nans_);
} else {
CHECK(data_format_ == FORMAT_NHWC)
<< "Non-Cudnn MaxPool only supports NHWC format";
Tensor* output = nullptr;
OP_REQUIRES_OK(context, context->allocate_output(0, out_shape, &output));
LaunchMaxPoolingNoMask<Device, T>::launch(context, params, tensor_in,
- output);
+ output, propagate_nans_);
}
}
@@ -1187,18 +1199,20 @@ class MaxPoolingNoMaskV2Op<GPUDevice, T> : public OpKernel {
Padding padding_;
TensorFormat data_format_;
bool use_dnn_;
+ bool propagate_nans_;
};
template <typename T>
struct LaunchMaxPoolingNoMask<Eigen::GpuDevice, T> {
static void launch(OpKernelContext* context, const PoolParameters& params,
- const Tensor& input, Tensor* output) {
+ const Tensor& input, Tensor* output, bool propagate_nans) {
bool status = functor::MaxPoolForwardWithOptionalArgmax<T>()(
input.flat<T>().data(), params.tensor_in_batch, params.tensor_in_rows,
params.tensor_in_cols, params.depth, params.out_height,
params.out_width, params.window_rows, params.window_cols,
params.row_stride, params.col_stride, params.pad_rows, params.pad_cols,
- output->flat<T>().data(), nullptr, context->eigen_gpu_device());
+ output->flat<T>().data(), nullptr, context->eigen_gpu_device(),
+ propagate_nans);
if (!status) {
context->SetStatus(
errors::Internal("Failed launching MaxPoolForwardNoMask"));
@@ -1209,7 +1223,8 @@ struct LaunchMaxPoolingNoMask<Eigen::GpuDevice, T> {
template <typename T>
struct LaunchMaxPoolingWithArgmax<Eigen::GpuDevice, T> {
static void launch(OpKernelContext* context, const PoolParameters& params,
- const Tensor& input, Tensor* output, Tensor* argmax) {
+ const Tensor& input, Tensor* output, Tensor* argmax,
+ bool propagate_nans) {
bool status = functor::MaxPoolForwardWithOptionalArgmax<T>()(
input.flat<T>().data(), params.tensor_in_batch, params.tensor_in_rows,
params.tensor_in_cols, params.depth, params.out_height,
@@ -1217,7 +1232,7 @@ struct LaunchMaxPoolingWithArgmax<Eigen::GpuDevice, T> {
params.row_stride, params.col_stride, params.pad_rows, params.pad_cols,
output->flat<T>().data(),
reinterpret_cast<int64*>(argmax->flat<int64>().data()),
- context->eigen_gpu_device());
+ context->eigen_gpu_device(), propagate_nans);
if (!status) {
context->SetStatus(
errors::Internal("Failed launching MaxPoolForwardWithArgmax"));
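All four GPU max-pooling kernels in this file now read the same switch once at construction time and thread it through to the launch helpers and to `DnnPoolingOp`/`DnnPoolingGradOp`. `ReadBoolFromEnvVar` comes from the newly included `tensorflow/core/util/env_var.h`; the Status it returns is ignored at these call sites, so a malformed value silently falls back to the default. A sketch of the pattern, with a hypothetical kernel class name:

    #include "tensorflow/core/framework/op_kernel.h"
    #include "tensorflow/core/util/env_var.h"

    class SomeMaxPoolOp : public tensorflow::OpKernel {  // hypothetical
     public:
      explicit SomeMaxPoolOp(tensorflow::OpKernelConstruction* context)
          : OpKernel(context) {
        // Defaults to false unless TF_ENABLE_MAXPOOL_NANPROP is set to a
        // truthy value; the returned Status is discarded, as in the diff.
        tensorflow::ReadBoolFromEnvVar("TF_ENABLE_MAXPOOL_NANPROP", false,
                                       &propagate_nans_);
      }
      void Compute(tensorflow::OpKernelContext* context) override {}

     private:
      bool propagate_nans_;  // forwarded to the CUDA/cuDNN launch paths
    };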
diff --git a/tensorflow/core/kernels/maxpooling_op_gpu.cu.cc b/tensorflow/core/kernels/maxpooling_op_gpu.cu.cc
index 26f5274804..d96b844383 100644
--- a/tensorflow/core/kernels/maxpooling_op_gpu.cu.cc
+++ b/tensorflow/core/kernels/maxpooling_op_gpu.cu.cc
@@ -29,6 +29,15 @@ limitations under the License.
namespace tensorflow {
namespace {
+template <bool propagate_nans, typename dtype>
+EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE bool IsGreaterThan(dtype a, dtype b) {
+ if (propagate_nans) {
+ return !(a <= b);
+ } else {
+ return a > b;
+ }
+}
+
// This is Yangqing's custom kernel for the maxpooling operation. There are
// three functions: MaxPoolForwardNCHW and MaxPoolForwardNHWC are the two
// forward functions, dealing with the forward case. MaxPoolBackward is the
@@ -51,7 +60,7 @@ namespace {
// const int output_size = batch * channels * pooled_height * pooled_width;
// MaxPoolForwardNCHW<<<(output_size + kThreadsPerBlock - 1) / kThreadsPerBlock,
// kThreadsPerBlock, 0, cuda_stream>>>(...);
-template <typename dtype>
+template <bool propagate_nans, typename dtype>
__global__ void MaxPoolForwardNCHW(const int nthreads, const dtype* bottom_data,
const int channels, const int height,
const int width, const int pooled_height,
@@ -77,7 +86,7 @@ __global__ void MaxPoolForwardNCHW(const int nthreads, const dtype* bottom_data,
for (int h = hstart; h < hend; ++h) {
for (int w = wstart; w < wend; ++w) {
int idx = c * height * width + h * width + w;
- if (bottom_data_n[idx] > maxval) {
+ if (IsGreaterThan<propagate_nans>(bottom_data_n[idx], maxval)) {
maxidx = idx;
maxval = bottom_data_n[idx];
}
@@ -126,7 +135,7 @@ __global__ void MaxPoolForwardNoMaskKernel_NCHW_VECT_C(
}
}
-template <typename dtype>
+template <bool propagate_nans, typename dtype>
__global__ void MaxPoolForwardNHWC(const int nthreads, const dtype* bottom_data,
const int height, const int width,
const int channels, const int pooled_height,
@@ -153,7 +162,7 @@ __global__ void MaxPoolForwardNHWC(const int nthreads, const dtype* bottom_data,
for (int h = hstart; h < hend; ++h) {
for (int w = wstart; w < wend; ++w) {
int idx = (h * width + w) * channels + c;
- if (bottom_data_n[idx] > maxval) {
+ if (IsGreaterThan<propagate_nans>(bottom_data_n[idx], maxval)) {
maxidx = idx;
maxval = bottom_data_n[idx];
}
@@ -390,15 +399,24 @@ bool MaxPoolForwardWithOptionalArgmax<T>::operator()(
const int channels, const int pooled_height, const int pooled_width,
const int kernel_h, const int kernel_w, const int stride_h,
const int stride_w, const int pad_t, const int pad_l, T* top_data,
- int64* mask, const Eigen::GpuDevice& d) {
+ int64* mask, const Eigen::GpuDevice& d, bool propagate_nans) {
const int kThreadsPerBlock = 1024;
const int output_size = batch * channels * pooled_height * pooled_width;
-
- MaxPoolForwardNHWC<<<(output_size + kThreadsPerBlock - 1) / kThreadsPerBlock,
- kThreadsPerBlock, 0, d.stream()>>>(
- output_size, bottom_data, height, width, channels, pooled_height,
- pooled_width, kernel_h, kernel_w, stride_h, stride_w, pad_t, pad_l,
- top_data, mask);
+ if (propagate_nans) {
+ MaxPoolForwardNHWC<true>
+ <<<(output_size + kThreadsPerBlock - 1) / kThreadsPerBlock,
+ kThreadsPerBlock, 0, d.stream()>>>
+ (output_size, bottom_data, height, width, channels, pooled_height,
+ pooled_width, kernel_h, kernel_w, stride_h, stride_w, pad_t, pad_l,
+ top_data, mask);
+ } else {
+ MaxPoolForwardNHWC<false>
+ <<<(output_size + kThreadsPerBlock - 1) / kThreadsPerBlock,
+ kThreadsPerBlock, 0, d.stream()>>>
+ (output_size, bottom_data, height, width, channels, pooled_height,
+ pooled_width, kernel_h, kernel_w, stride_h, stride_w, pad_t, pad_l,
+ top_data, mask);
+ }
return d.ok();
}
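The compile-time `propagate_nans` template parameter keeps the inner-loop comparison branch-free at runtime; the launcher above simply instantiates `MaxPoolForwardNHWC<true>` or `<false>`. The trick itself relies on IEEE-754 semantics: every ordered comparison involving a NaN is false, so `!(a <= b)` is true whenever the candidate `a` is NaN, letting a NaN displace the running maximum. A host-side demonstration of the predicate:

    #include <cstdio>
    #include <limits>

    // Host-side copy of the predicate above, for illustration only.
    template <bool propagate_nans, typename T>
    bool IsGreaterThan(T a, T b) {
      return propagate_nans ? !(a <= b) : (a > b);
    }

    int main() {
      const float nan = std::numeric_limits<float>::quiet_NaN();
      std::printf("%d\n", IsGreaterThan<true>(nan, 1.0f));   // 1: NaN wins the max
      std::printf("%d\n", IsGreaterThan<false>(nan, 1.0f));  // 0: NaN is ignored
      return 0;
    }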
diff --git a/tensorflow/core/kernels/maxpooling_op_gpu.h b/tensorflow/core/kernels/maxpooling_op_gpu.h
index 34203797cf..38ebb34248 100644
--- a/tensorflow/core/kernels/maxpooling_op_gpu.h
+++ b/tensorflow/core/kernels/maxpooling_op_gpu.h
@@ -39,7 +39,7 @@ struct MaxPoolForwardWithOptionalArgmax {
const int pooled_width, const int kernel_h,
const int kernel_w, const int stride_h, const int stride_w,
const int pad_t, const int pad_l, T* top_data, int64* mask,
- const Eigen::GpuDevice& d);
+ const Eigen::GpuDevice& d, bool propagate_nans);
};
struct MaxPoolForwardNoMask_NCHW_VECT_C {
diff --git a/tensorflow/core/kernels/mkl_conv_grad_filter_ops.cc b/tensorflow/core/kernels/mkl_conv_grad_filter_ops.cc
index 9080bf7be8..f291281108 100644
--- a/tensorflow/core/kernels/mkl_conv_grad_filter_ops.cc
+++ b/tensorflow/core/kernels/mkl_conv_grad_filter_ops.cc
@@ -45,12 +45,12 @@ limitations under the License.
#ifdef INTEL_MKL_DNN
#include "mkldnn.hpp"
-using mkldnn::prop_kind;
using mkldnn::stream;
+using mkldnn::prop_kind;
+using mkldnn::convolution_forward;
using mkldnn::convolution_backward_weights;
using mkldnn::convolution_direct;
-using mkldnn::convolution_forward;
#endif
@@ -463,13 +463,12 @@ class MklConv2DCustomBackpropFilterOp : public OpKernel {
// Generate input shapes.
TensorShape filter_shape;
- OP_REQUIRES(
- context, TensorShapeUtils::IsVector(filter_tensor.shape()),
- errors::InvalidArgument(
+ OP_REQUIRES(context, TensorShapeUtils::IsVector(filter_tensor.shape()),
+ errors::InvalidArgument(
"Conv2DBackpropFilter: filter_sizes input must be 1-dim, not ",
filter_tensor.dims()));
OP_REQUIRES_OK(context, TensorShapeUtils::MakeShape(
- filter_tensor.vec<int32>(), &filter_shape));
+ filter_tensor.vec<int32>(), &filter_shape));
TensorShape input_shape = input_tensor.shape();
TensorShape obp_shape = obp_tensor.shape();
@@ -481,26 +480,27 @@ class MklConv2DCustomBackpropFilterOp : public OpKernel {
// Get forward convolution parameters.
MklDnnConvUtil conv_utl(context, strides_, padding_, data_format_);
- conv_utl.GetConvFwdSizesInMklOrder(
- input_shape, filter_shape, &fwd_input_dims, &fwd_filter_dims,
- &strides, &fwd_output_dims_tf_order, &fwd_output_dims, &padding_l,
- &padding_r);
+ conv_utl.GetConvFwdSizesInMklOrder(input_shape, filter_shape,
+ &fwd_input_dims, &fwd_filter_dims,
+ &strides,
+ &fwd_output_dims_tf_order,
+ &fwd_output_dims,
+ &padding_l, &padding_r);
if (!context->status().ok()) return;
// Create Convolution forward descriptor since Convolution backward
// API needs it. For that, we first need to create input, filter
// and output memory descriptors.
auto mkl_data_format = TFDataFormatToMklDnnDataFormat(data_format_);
- auto fwd_src_md =
- memory::desc(fwd_input_dims, MklDnnType<T>(), mkl_data_format);
- auto fwd_filter_md =
- memory::desc(fwd_filter_dims, MklDnnType<T>(), memory::format::hwio);
- auto fwd_out_md =
- memory::desc(fwd_output_dims, MklDnnType<T>(), mkl_data_format);
- auto fwd_desc = convolution_forward::desc(
- prop_kind::forward, convolution_direct, fwd_src_md, fwd_filter_md,
- fwd_out_md, strides, padding_l, padding_r,
- TFPaddingToMklDnnPadding(padding_));
+ auto fwd_src_md = memory::desc(fwd_input_dims, MklDnnType<T>(),
+ mkl_data_format);
+ auto fwd_filter_md = memory::desc(fwd_filter_dims, MklDnnType<T>(),
+ memory::format::hwio);
+ auto fwd_out_md = memory::desc(fwd_output_dims, MklDnnType<T>(),
+ mkl_data_format);
+ auto fwd_desc = convolution_forward::desc(prop_kind::forward,
+ convolution_direct, fwd_src_md, fwd_filter_md, fwd_out_md,
+ strides, padding_l, padding_r, TFPaddingToMklDnnPadding(padding_));
auto fwd_pd = convolution_forward::primitive_desc(fwd_desc, cpu_engine);
// Allocate output tensor and shape
@@ -537,22 +537,23 @@ class MklConv2DCustomBackpropFilterOp : public OpKernel {
output.SetOpMemDesc(bwd_output_dims, memory::format::any);
// Create convolution backward weights primitive.
- auto bwd_desc = convolution_backward_weights::desc(
- convolution_direct, input.GetOpMemDesc(), output.GetOpMemDesc(),
- outbackprop.GetOpMemDesc(), strides, padding_l, padding_r,
- TFPaddingToMklDnnPadding(padding_));
+ auto bwd_desc = convolution_backward_weights::desc(convolution_direct,
+ input.GetOpMemDesc(), output.GetOpMemDesc(),
+ outbackprop.GetOpMemDesc(), strides, padding_l,
+ padding_r, TFPaddingToMklDnnPadding(padding_));
- auto bwd_pd = convolution_backward_weights::primitive_desc(
- bwd_desc, cpu_engine, fwd_pd);
+ auto bwd_pd = convolution_backward_weights::primitive_desc(bwd_desc,
+ cpu_engine,
+ fwd_pd);
PrepareAndExecutePrimitive(bwd_pd, &input, &outbackprop, &output);
- } catch (mkldnn::error& e) {
- string error_msg = "Status: " + std::to_string(e.status) +
- ", message: " + string(e.message) + ", in file " +
- string(__FILE__) + ":" + std::to_string(__LINE__);
- OP_REQUIRES_OK(
- context,
- errors::Aborted("Operation received an exception:", error_msg));
+ } catch (mkldnn::error &e) {
+ string error_msg = "Status: " + std::to_string(e.status) +
+ ", message: " + string(e.message) +
+ ", in file " + string(__FILE__) + ":" +
+ std::to_string(__LINE__);
+ OP_REQUIRES_OK(context, errors::Aborted("Operation received an exception:",
+ error_msg));
}
}
@@ -563,8 +564,9 @@ class MklConv2DCustomBackpropFilterOp : public OpKernel {
// Prepare and execute net - checks for input and output reorders.
void PrepareAndExecutePrimitive(
- const convolution_backward_weights::primitive_desc& conv_pd,
- MklDnnData<T>* input, MklDnnData<T>* obp, MklDnnData<T>* output) {
+ const convolution_backward_weights::primitive_desc& conv_pd,
+ MklDnnData<T>* input, MklDnnData<T>* obp,
+ MklDnnData<T>* output) {
// Create reorders between user layout and MKL layout if it is needed and
// add it to the net before convolution.
std::vector<primitive> net;
@@ -575,10 +577,10 @@ class MklConv2DCustomBackpropFilterOp : public OpKernel {
// output side, we will prepare reorder primitive in case output
// reorder to user memory is required.
bool output_reorder_required = output->PrepareReorderToUserMemIfReq(
- conv_pd.diff_weights_primitive_desc());
+ conv_pd.diff_weights_primitive_desc());
- net.push_back(convolution_backward_weights(
- conv_pd, input->GetOpMem(), obp->GetOpMem(), output->GetOpMem()));
+ net.push_back(convolution_backward_weights(conv_pd, input->GetOpMem(),
+ obp->GetOpMem(), output->GetOpMem()));
// Insert reorder primitive in the net for output reorder if reorder is
// required.
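Aside from the whitespace churn of the revert, note the recurring exception boundary in these MKL kernels: every MKL-DNN code path is wrapped in try/catch, and `mkldnn::error`, which carries a numeric `status` and a `message` string, is folded into an aborted TensorFlow Status. The same block reappears in the next two files. A hypothetical helper condensing it; this is not part of the diff:

    #include <string>
    #include "mkldnn.hpp"
    #include "tensorflow/core/lib/core/errors.h"

    // Hypothetical: converts an MKL-DNN exception into the Status shape
    // produced by the catch blocks in this diff.
    tensorflow::Status MklErrorToStatus(const mkldnn::error& e,
                                        const char* file, int line) {
      std::string msg = "Status: " + std::to_string(e.status) +
                        ", message: " + std::string(e.message) +
                        ", in file " + std::string(file) + ":" +
                        std::to_string(line);
      return tensorflow::errors::Aborted("Operation received an exception:", msg);
    }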
diff --git a/tensorflow/core/kernels/mkl_conv_grad_input_ops.cc b/tensorflow/core/kernels/mkl_conv_grad_input_ops.cc
index 4b6bf92e42..4a47d0463e 100644
--- a/tensorflow/core/kernels/mkl_conv_grad_input_ops.cc
+++ b/tensorflow/core/kernels/mkl_conv_grad_input_ops.cc
@@ -23,8 +23,6 @@ limitations under the License.
#define EIGEN_USE_THREADS
#include <algorithm>
#include <vector>
-#include "mkl_dnn.h"
-#include "mkl_dnn_types.h"
#include "tensorflow/core/framework/numeric_op.h"
#include "tensorflow/core/framework/op_kernel.h"
#include "tensorflow/core/framework/register_types.h"
@@ -43,16 +41,18 @@ limitations under the License.
#include "tensorflow/core/util/tensor_format.h"
#include "tensorflow/core/util/use_cudnn.h"
#include "tensorflow/core/util/work_sharder.h"
+#include "mkl_dnn.h"
+#include "mkl_dnn_types.h"
#ifdef INTEL_MKL_DNN
#include "mkldnn.hpp"
-using mkldnn::prop_kind;
using mkldnn::stream;
+using mkldnn::prop_kind;
-using mkldnn::convolution_backward_data;
-using mkldnn::convolution_direct;
using mkldnn::convolution_forward;
+using mkldnn::convolution_direct;
+using mkldnn::convolution_backward_data;
#endif
namespace tensorflow {
@@ -397,13 +397,12 @@ class MklConv2DCustomBackpropInputOp : public OpKernel {
// Generate input shape.
TensorShape input_shape;
- OP_REQUIRES(
- context, TensorShapeUtils::IsVector(input_tensor.shape()),
- errors::InvalidArgument(
+ OP_REQUIRES(context, TensorShapeUtils::IsVector(input_tensor.shape()),
+ errors::InvalidArgument(
"Conv2DBackpropInput: input_sizes input must be 1-dim, not ",
input_tensor.dims()));
OP_REQUIRES_OK(context, TensorShapeUtils::MakeShape(
- input_tensor.vec<int32>(), &input_shape));
+ input_tensor.vec<int32>(), &input_shape));
TensorShape filter_shape = filter_tensor.shape();
TensorShape obp_shape = obp_tensor.shape();
@@ -415,26 +414,27 @@ class MklConv2DCustomBackpropInputOp : public OpKernel {
// Get forward convolution parameters.
MklDnnConvUtil conv_utl(context, strides_, padding_, data_format_);
- conv_utl.GetConvFwdSizesInMklOrder(
- input_shape, filter_shape, &fwd_input_dims, &fwd_filter_dims,
- &strides, &fwd_output_dims_tf_order, &fwd_output_dims, &padding_l,
- &padding_r);
+ conv_utl.GetConvFwdSizesInMklOrder(input_shape, filter_shape,
+ &fwd_input_dims, &fwd_filter_dims,
+ &strides,
+ &fwd_output_dims_tf_order,
+ &fwd_output_dims,
+ &padding_l, &padding_r);
if (!context->status().ok()) return;
// Create Convolution forward descriptor since Convolution backward
// API needs it. For that, we first need to create input, filter
// and output memory descriptors.
auto mkl_data_format = TFDataFormatToMklDnnDataFormat(data_format_);
- auto fwd_src_md =
- memory::desc(fwd_input_dims, MklDnnType<T>(), mkl_data_format);
- auto fwd_filter_md =
- memory::desc(fwd_filter_dims, MklDnnType<T>(), memory::format::hwio);
- auto fwd_out_md =
- memory::desc(fwd_output_dims, MklDnnType<T>(), mkl_data_format);
- auto fwd_desc = convolution_forward::desc(
- prop_kind::forward, convolution_direct, fwd_src_md, fwd_filter_md,
- fwd_out_md, strides, padding_l, padding_r,
- TFPaddingToMklDnnPadding(padding_));
+ auto fwd_src_md = memory::desc(fwd_input_dims, MklDnnType<T>(),
+ mkl_data_format);
+ auto fwd_filter_md = memory::desc(fwd_filter_dims, MklDnnType<T>(),
+ memory::format::hwio);
+ auto fwd_out_md = memory::desc(fwd_output_dims, MklDnnType<T>(),
+ mkl_data_format);
+ auto fwd_desc = convolution_forward::desc(prop_kind::forward,
+ convolution_direct, fwd_src_md, fwd_filter_md, fwd_out_md,
+ strides, padding_l, padding_r, TFPaddingToMklDnnPadding(padding_));
auto fwd_pd = convolution_forward::primitive_desc(fwd_desc, cpu_engine);
// Allocate output tensor and shape
@@ -475,22 +475,23 @@ class MklConv2DCustomBackpropInputOp : public OpKernel {
output.SetOpMemDesc(bwd_output_dims, memory::format::any);
// Create convolution backward data primitive.
- auto bwd_desc = convolution_backward_data::desc(
- convolution_direct, output.GetOpMemDesc(), filter.GetOpMemDesc(),
- outbackprop.GetOpMemDesc(), strides, padding_l, padding_r,
- TFPaddingToMklDnnPadding(padding_));
+ auto bwd_desc = convolution_backward_data::desc(convolution_direct,
+ output.GetOpMemDesc(), filter.GetOpMemDesc(),
+ outbackprop.GetOpMemDesc(), strides, padding_l,
+ padding_r, TFPaddingToMklDnnPadding(padding_));
- auto bwd_pd = convolution_backward_data::primitive_desc(
- bwd_desc, cpu_engine, fwd_pd);
+ auto bwd_pd = convolution_backward_data::primitive_desc(bwd_desc,
+ cpu_engine,
+ fwd_pd);
PrepareAndExecutePrimitive(bwd_pd, &filter, &outbackprop, &output);
- } catch (mkldnn::error& e) {
- string error_msg = "Status: " + std::to_string(e.status) +
- ", message: " + string(e.message) + ", in file " +
- string(__FILE__) + ":" + std::to_string(__LINE__);
- OP_REQUIRES_OK(
- context,
- errors::Aborted("Operation received an exception:", error_msg));
+ } catch (mkldnn::error &e) {
+ string error_msg = "Status: " + std::to_string(e.status) +
+ ", message: " + string(e.message) +
+ ", in file " + string(__FILE__) + ":" +
+ std::to_string(__LINE__);
+ OP_REQUIRES_OK(context, errors::Aborted("Operation received an exception:",
+ error_msg));
}
}
@@ -501,8 +502,9 @@ class MklConv2DCustomBackpropInputOp : public OpKernel {
// Prepare and execute net - checks for input and output reorders.
void PrepareAndExecutePrimitive(
- const convolution_backward_data::primitive_desc& conv_pd,
- MklDnnData<T>* filter, MklDnnData<T>* obp, MklDnnData<T>* output) {
+ const convolution_backward_data::primitive_desc& conv_pd,
+ MklDnnData<T>* filter, MklDnnData<T>* obp,
+ MklDnnData<T>* output) {
// Create reorders between user layout and MKL layout if it is needed and
// add it to the net before convolution.
std::vector<primitive> net;
@@ -512,11 +514,11 @@ class MklConv2DCustomBackpropInputOp : public OpKernel {
// Memory for output of convolution. Since we may need reorder on the
// output side, we will prepare reorder primitive in case output
// reorder to user memory is required.
- bool output_reorder_required =
- output->PrepareReorderToUserMemIfReq(conv_pd.diff_src_primitive_desc());
+ bool output_reorder_required = output->PrepareReorderToUserMemIfReq(
+ conv_pd.diff_src_primitive_desc());
- net.push_back(convolution_backward_data(
- conv_pd, obp->GetOpMem(), filter->GetOpMem(), output->GetOpMem()));
+ net.push_back(convolution_backward_data(conv_pd, obp->GetOpMem(),
+ filter->GetOpMem(), output->GetOpMem()));
// Insert reorder primitive in the net for output reorder if reorder is
// required.
diff --git a/tensorflow/core/kernels/mkl_conv_ops.cc b/tensorflow/core/kernels/mkl_conv_ops.cc
index 369f632fb4..a9872b8d6d 100644
--- a/tensorflow/core/kernels/mkl_conv_ops.cc
+++ b/tensorflow/core/kernels/mkl_conv_ops.cc
@@ -18,8 +18,8 @@ limitations under the License.
#include <string.h>
#include <map>
-#include <string>
#include <vector>
+#include <string>
#include "tensorflow/core/framework/numeric_op.h"
#include "tensorflow/core/framework/op_kernel.h"
@@ -46,11 +46,11 @@ limitations under the License.
#ifdef INTEL_MKL_DNN
#include "mkldnn.hpp"
-using mkldnn::prop_kind;
using mkldnn::stream;
+using mkldnn::prop_kind;
-using mkldnn::convolution_direct;
using mkldnn::convolution_forward;
+using mkldnn::convolution_direct;
#endif
namespace tensorflow {
@@ -523,16 +523,19 @@ class MklConv2DOp : public OpKernel {
// Get shapes of input tensors in MKL-DNN order
MklDnnConvUtil conv_utl(context, strides_, padding_, data_format_);
- conv_utl.GetConvFwdSizesInMklOrder(
- src_tensor.shape(), filter_tensor.shape(), &src_dims, &filter_dims,
- &strides, &output_dims_tf_order, &output_dims_mkl_order, &padding_l,
- &padding_r);
+ conv_utl.GetConvFwdSizesInMklOrder(src_tensor.shape(),
+ filter_tensor.shape(),
+ &src_dims, &filter_dims, &strides,
+ &output_dims_tf_order,
+ &output_dims_mkl_order, &padding_l,
+ &padding_r);
if (!context->status().ok()) return;
// Check for corner case - if there is nothing to compute, return.
- TensorShape tf_output_shape(
- {output_dims_tf_order[0], output_dims_tf_order[1],
- output_dims_tf_order[2], output_dims_tf_order[3]});
+ TensorShape tf_output_shape({output_dims_tf_order[0],
+ output_dims_tf_order[1],
+ output_dims_tf_order[2],
+ output_dims_tf_order[3]});
Tensor* output_tensor = nullptr;
MklShape mkl_output_mkl_shape;
mkl_output_mkl_shape.SetMklTensor(false);
@@ -569,13 +572,13 @@ class MklConv2DOp : public OpKernel {
// the layout is Tensorflow's layout (NHWC or NCHW depending on data
// format).
src.SetUsrMem(src_dims, TFDataFormatToMklDnnDataFormat(data_format_),
- const_cast<void*>(
- static_cast<const void*>(src_tensor.flat<T>().data())));
+ const_cast<void*>(static_cast<const void*>(
+ src_tensor.flat<T>().data())));
// Although filter shape (filter_dims) required is in MKL-DNN order,
// the layout is Tensorflow's layout (HWIO).
filter.SetUsrMem(filter_dims, memory::format::hwio,
const_cast<void*>(static_cast<const void*>(
- filter_tensor.flat<T>().data())));
+ filter_tensor.flat<T>().data())));
// Although output shape (output_dims) required is in MKL-DNN order,
// layout is Tensorflow's layout (NHWC or NCHW depending on data format).
output.SetUsrMem(output_dims_mkl_order,
@@ -595,36 +598,36 @@ class MklConv2DOp : public OpKernel {
const Tensor& bias_tensor = MklGetInput(context, 2);
bias.SetUsrMem(bias_size, memory::format::x,
const_cast<void*>(static_cast<const void*>(
- bias_tensor.flat<T>().data())));
+ bias_tensor.flat<T>().data())));
bias.SetOpMemDesc(bias_size, memory::format::any);
// Create convolution primitive with Bias.
- auto conv_desc = convolution_forward::desc(
- prop_kind::forward, convolution_direct, src.GetOpMemDesc(),
- filter.GetOpMemDesc(), bias.GetOpMemDesc(), output.GetOpMemDesc(),
- strides, padding_l, padding_r, TFPaddingToMklDnnPadding(padding_));
+ auto conv_desc = convolution_forward::desc(prop_kind::forward,
+ convolution_direct, src.GetOpMemDesc(), filter.GetOpMemDesc(),
+ bias.GetOpMemDesc(), output.GetOpMemDesc(), strides,
+ padding_l, padding_r, TFPaddingToMklDnnPadding(padding_));
- auto conv_prim_desc =
- convolution_forward::primitive_desc(conv_desc, cpu_engine);
+ auto conv_prim_desc = convolution_forward::primitive_desc(conv_desc,
+ cpu_engine);
PrepareAndExecuteNet(conv_prim_desc, &src, &filter, &bias, &output);
} else {
// Create convolution primitive without Bias.
- auto conv_desc = convolution_forward::desc(
- prop_kind::forward, convolution_direct, src.GetOpMemDesc(),
- filter.GetOpMemDesc(), output.GetOpMemDesc(), strides, padding_l,
- padding_r, TFPaddingToMklDnnPadding(padding_));
+ auto conv_desc = convolution_forward::desc(prop_kind::forward,
+ convolution_direct, src.GetOpMemDesc(), filter.GetOpMemDesc(),
+ output.GetOpMemDesc(), strides, padding_l, padding_r,
+ TFPaddingToMklDnnPadding(padding_));
- auto conv_prim_desc =
- convolution_forward::primitive_desc(conv_desc, cpu_engine);
+ auto conv_prim_desc = convolution_forward::primitive_desc(conv_desc,
+ cpu_engine);
PrepareAndExecuteNet(conv_prim_desc, &src, &filter, nullptr, &output);
}
- } catch (mkldnn::error& e) {
+ } catch (mkldnn::error &e) {
string error_msg = "Status: " + std::to_string(e.status) +
- ", message: " + std::string(e.message) + ", in file " +
- std::string(__FILE__) + ":" + std::to_string(__LINE__);
- OP_REQUIRES_OK(
- context,
- errors::Aborted("Operation received an exception:", error_msg));
+ ", message: " + std::string(e.message) +
+ ", in file " + std::string(__FILE__) + ":" +
+ std::to_string(__LINE__);
+ OP_REQUIRES_OK(context,
+ errors::Aborted("Operation received an exception:", error_msg));
}
}
@@ -635,9 +638,9 @@ class MklConv2DOp : public OpKernel {
// Prepare and execute net - checks for input and output reorders.
void PrepareAndExecuteNet(
- const convolution_forward::primitive_desc& conv_prim_desc,
- MklDnnData<T>* src, MklDnnData<T>* filter, MklDnnData<T>* bias,
- MklDnnData<T>* output) {
+ const convolution_forward::primitive_desc& conv_prim_desc,
+ MklDnnData<T>* src, MklDnnData<T>* filter,
+ MklDnnData<T>* bias, MklDnnData<T>* output) {
// Create reorders between user layout and MKL layout if it is needed and
// add it to the net before convolution.
std::vector<primitive> net;
@@ -648,19 +651,18 @@ class MklConv2DOp : public OpKernel {
// output side, we will prepare reorder primitive in case output
// reorder to user memory is required.
bool output_reorder_required = output->PrepareReorderToUserMemIfReq(
- conv_prim_desc.dst_primitive_desc());
+ conv_prim_desc.dst_primitive_desc());
// Create convolution primitive and add it to net.
if (bias) {
CHECK_EQ(biasEnabled, true);
net.push_back(convolution_forward(conv_prim_desc, src->GetOpMem(),
- filter->GetOpMem(), bias->GetOpMem(),
- output->GetOpMem()));
+ filter->GetOpMem(), bias->GetOpMem(),
+ output->GetOpMem()));
} else {
CHECK_EQ(biasEnabled, false);
net.push_back(convolution_forward(conv_prim_desc, src->GetOpMem(),
- filter->GetOpMem(),
- output->GetOpMem()));
+ filter->GetOpMem(), output->GetOpMem()));
}
// Insert reorder primitive in the net for output reorder if reorder is
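Underneath the re-wrapped argument lists, the structure of `Compute` is unchanged: build `memory::desc`s for src/filter/(bias)/output, create a `convolution_forward::desc` plus `primitive_desc`, insert any needed layout reorders, then push the primitive onto the net and submit. Condensed below; descriptor and memory variables are assumed to be prepared as in the surrounding code:

    // Forward convolution with the 0.x-era MKL-DNN API used in this file.
    // The biased overload of convolution_forward::desc takes one extra
    // memory::desc between filter_md and out_md.
    auto conv_desc = convolution_forward::desc(
        prop_kind::forward, convolution_direct, src_md, filter_md, out_md,
        strides, padding_l, padding_r, TFPaddingToMklDnnPadding(padding_));
    auto conv_pd = convolution_forward::primitive_desc(conv_desc, cpu_engine);

    std::vector<primitive> net;
    net.push_back(convolution_forward(conv_pd, src_mem, filter_mem, out_mem));
    stream(stream::kind::eager).submit(net).wait();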
diff --git a/tensorflow/core/kernels/mkl_conv_ops.h b/tensorflow/core/kernels/mkl_conv_ops.h
index e29af19ca9..f0cb37f8a4 100644
--- a/tensorflow/core/kernels/mkl_conv_ops.h
+++ b/tensorflow/core/kernels/mkl_conv_ops.h
@@ -16,8 +16,8 @@ limitations under the License.
#ifndef TENSORFLOW_CORE_KERNELS_MKL_CONV_OPS_H_
#define TENSORFLOW_CORE_KERNELS_MKL_CONV_OPS_H_
-#include <limits>
#include <vector>
+#include <limits>
#include "tensorflow/core/framework/numeric_op.h"
#include "tensorflow/core/framework/op_kernel.h"
@@ -26,8 +26,8 @@ limitations under the License.
#include "tensorflow/core/framework/tensor_shape.h"
#include "tensorflow/core/framework/tensor_slice.h"
#include "tensorflow/core/kernels/bounds_check.h"
-#include "tensorflow/core/kernels/conv_grad_ops.h"
#include "tensorflow/core/kernels/ops_util.h"
+#include "tensorflow/core/kernels/conv_grad_ops.h"
#include "tensorflow/core/lib/core/errors.h"
#include "tensorflow/core/lib/gtl/array_slice.h"
#include "tensorflow/core/lib/strings/numbers.h"
@@ -49,15 +49,15 @@ namespace tensorflow {
class MklDnnConvUtil {
protected:
- OpKernelContext *context_; // We don't own this.
+ OpKernelContext* context_; // We don't own this.
std::vector<int32> strides_;
Padding padding_;
TensorFormat data_format_;
public:
- MklDnnConvUtil(OpKernelContext *context, const std::vector<int32> &strides,
- Padding pad, TensorFormat fm)
- : context_(context), strides_(strides), padding_(pad), data_format_(fm) {}
+ MklDnnConvUtil(OpKernelContext* context, const std::vector<int32>& strides,
+ Padding pad, TensorFormat fm) : context_(context),
+ strides_(strides), padding_(pad), data_format_(fm) {}
virtual ~MklDnnConvUtil() { context_ = nullptr; }
@@ -75,14 +75,14 @@ class MklDnnConvUtil {
// requires input in NCHW format. Function does not return anything.
// But errors arising from sanity checks are returned in context's
// status.
- virtual inline void GetInputSizeInMklOrder(const TensorShape &input_shape,
- memory::dims *input_dims) {
-#define CHECK_BOUNDS(val, err_msg) \
- do { \
- OP_REQUIRES(context_, \
- FastBoundsCheck(val, std::numeric_limits<int>::max()), \
- errors::InvalidArgument(err_msg)); \
- } while (0)
+ virtual inline void
+ GetInputSizeInMklOrder(const TensorShape& input_shape,
+ memory::dims *input_dims) {
+ #define CHECK_BOUNDS(val, err_msg) do { \
+ OP_REQUIRES(context_, FastBoundsCheck(val, \
+ std::numeric_limits<int>::max()), \
+ errors::InvalidArgument(err_msg)); \
+ }while(0)
CHECK_NOTNULL(input_dims);
@@ -105,7 +105,7 @@ class MklDnnConvUtil {
CHECK_BOUNDS(input_batch_raw, "Input batch too large");
int input_batch = static_cast<int>(input_batch_raw);
-#undef CHECK_BOUNDS
+ #undef CHECK_BOUNDS
// MKL-DNN always requires input in NCHW format.
*input_dims = {input_batch, input_depth, input_rows, input_cols};
@@ -125,9 +125,10 @@ class MklDnnConvUtil {
// forward gets actual tensor as input).
//
// TODO(nhasabni): Add similar function for input and filter in MklShape.
- virtual inline void GetFilterSizeInMklOrder(const TensorShape &input_shape,
- const TensorShape &filter_shape,
- memory::dims *filter_dims) {
+ virtual inline void
+ GetFilterSizeInMklOrder(const TensorShape& input_shape,
+ const TensorShape& filter_shape,
+ memory::dims *filter_dims) {
CHECK_NOTNULL(filter_dims);
OP_REQUIRES(context_, filter_shape.dims() == 4,
@@ -135,18 +136,17 @@ class MklDnnConvUtil {
filter_shape.DebugString()));
for (int i = 0; i < 3; i++) {
- OP_REQUIRES(context_,
- FastBoundsCheck(filter_shape.dim_size(i),
- std::numeric_limits<int>::max()),
- errors::InvalidArgument("filter too large"));
+ OP_REQUIRES(context_, FastBoundsCheck(filter_shape.dim_size(i),
+ std::numeric_limits<int>::max()),
+ errors::InvalidArgument("filter too large"));
}
int input_depth = GetTensorDim(input_shape, data_format_, 'C');
- OP_REQUIRES(context_, input_depth == filter_shape.dim_size(2),
- errors::InvalidArgument(
- "input and filter must have the same depth: ", input_depth,
- " vs ", filter_shape.dim_size(2)));
+ OP_REQUIRES(
+ context_, input_depth == filter_shape.dim_size(2),
+ errors::InvalidArgument("input and filter must have the same depth: ",
+ input_depth, " vs ", filter_shape.dim_size(2)));
// TF filter is always in (rows, cols, in_depth, out_depth) order.
int filter_rows = static_cast<int>(filter_shape.dim_size(0));
@@ -163,25 +163,25 @@ class MklDnnConvUtil {
// requires filter in OIHW format. Function does not return anything.
// But errors arising from sanity checks are returned in context's
// status.
- virtual inline void GetFilterSizeInMklOrder(size_t src_index,
- size_t filter_index,
- memory::dims *filter_dims) {
+ virtual inline void
+ GetFilterSizeInMklOrder(size_t src_index, size_t filter_index,
+ memory::dims *filter_dims) {
CHECK_NOTNULL(filter_dims);
- const Tensor &input = MklGetInput(context_, src_index);
- const Tensor &filter = MklGetInput(context_, filter_index);
+ const Tensor& input = MklGetInput(context_, src_index);
+ const Tensor& filter = MklGetInput(context_, filter_index);
GetFilterSizeInMklOrder(input.shape(), filter.shape(), filter_dims);
}
// Calculate Bias size for 2D Convolution. Function does not return
// anything, but sets error in context status.
- virtual inline void GetBiasSizeInMklOrder(size_t bias_index,
- memory::dims *bias_dims) {
- const Tensor &bias = MklGetInput(context_, bias_index);
+ virtual inline void
+ GetBiasSizeInMklOrder(size_t bias_index, memory::dims *bias_dims) {
+ const Tensor& bias = MklGetInput(context_, bias_index);
OP_REQUIRES(context_, bias.dims() == 1,
errors::InvalidArgument("bias must be 1-dimensional: ",
bias.shape().DebugString()));
- *bias_dims = {static_cast<int>(bias.dim_size(0))};
+ *bias_dims = { static_cast<int>(bias.dim_size(0)) };
}
// Function to calculate output and padding size for 2D convolution.
@@ -193,11 +193,13 @@ class MklDnnConvUtil {
// status is returned via context status.
//
// TODO(nhasabni): Add similar function for input and filter in MklShape.
- virtual inline void GetOutputAndPadSizeInMklOrder(
- const TensorShape &input_shape, const TensorShape &filter_shape,
- const memory::dims &strides, memory::dims *output_dims_tf_order,
- memory::dims *output_dims_mkl_order, memory::dims *pad_l,
- memory::dims *pad_r) {
+ virtual inline void
+ GetOutputAndPadSizeInMklOrder(const TensorShape& input_shape,
+ const TensorShape& filter_shape,
+ const memory::dims& strides,
+ memory::dims *output_dims_tf_order,
+ memory::dims *output_dims_mkl_order,
+ memory::dims *pad_l, memory::dims *pad_r) {
CHECK_NOTNULL(output_dims_tf_order);
CHECK_NOTNULL(output_dims_mkl_order);
CHECK_NOTNULL(pad_l);
@@ -223,21 +225,21 @@ class MklDnnConvUtil {
int64 out_rows = 0, out_cols = 0;
int64 pad_top = 0, pad_bottom = 0, pad_left, pad_right;
- OP_REQUIRES_OK(context_, GetWindowedOutputSizeVerbose(
- input_rows, filter_rows, stride_rows, padding_,
- &out_rows, &pad_top, &pad_bottom));
- OP_REQUIRES_OK(context_, GetWindowedOutputSizeVerbose(
- input_cols, filter_cols, stride_cols, padding_,
- &out_cols, &pad_left, &pad_right));
+ OP_REQUIRES_OK(context_,
+ GetWindowedOutputSizeVerbose(input_rows, filter_rows, stride_rows,
+ padding_, &out_rows, &pad_top, &pad_bottom));
+ OP_REQUIRES_OK(context_,
+ GetWindowedOutputSizeVerbose(input_cols, filter_cols, stride_cols,
+ padding_, &out_cols, &pad_left, &pad_right));
// Tensorflow output is in data_format order. (NHWC or NCHW)
- TensorShape out_shape =
- ShapeFromFormat(data_format_, out_batch, out_rows, out_cols, out_depth);
+ TensorShape out_shape = ShapeFromFormat(data_format_, out_batch,
+ out_rows, out_cols, out_depth);
*output_dims_tf_order = TFShapeToMklDnnDims(out_shape);
// MKL-DNN always needs output in NCHW format.
*output_dims_mkl_order = {out_batch, out_depth, static_cast<int>(out_rows),
- static_cast<int>(out_cols)};
+ static_cast<int>(out_cols)};
// Now handle padding. MKL-DNN uses asymmetric padding.
*pad_l = {static_cast<int>(pad_top), static_cast<int>(pad_left)};
@@ -248,25 +250,27 @@ class MklDnnConvUtil {
// See comment on GetConvOutputAndPadSizeInMklOrder for parameters.
//
// Function does not return anything, but sets error in context status.
- inline void GetOutputAndPadSizeInMklOrder(
- size_t src_index, size_t filter_index, const memory::dims &strides,
- memory::dims *output_dims_tf_order, memory::dims *output_dims_mkl_order,
- memory::dims *pad_l, memory::dims *pad_r) {
+ inline void
+ GetOutputAndPadSizeInMklOrder(size_t src_index, size_t filter_index,
+ const memory::dims& strides,
+ memory::dims *output_dims_tf_order,
+ memory::dims *output_dims_mkl_order,
+ memory::dims *pad_l, memory::dims *pad_r) {
CHECK_NOTNULL(output_dims_tf_order);
CHECK_NOTNULL(output_dims_mkl_order);
CHECK_NOTNULL(pad_l);
CHECK_NOTNULL(pad_r);
- const Tensor &input = MklGetInput(context_, src_index);
- const Tensor &filter = MklGetInput(context_, filter_index);
+ const Tensor& input = MklGetInput(context_, src_index);
+ const Tensor& filter = MklGetInput(context_, filter_index);
OP_REQUIRES(context_, input.dims() == 4,
errors::InvalidArgument("input must be 4-dimensional",
- input.shape().DebugString()));
+ input.shape().DebugString()));
- GetOutputAndPadSizeInMklOrder(input.shape(), filter.shape(), strides,
- output_dims_tf_order, output_dims_mkl_order,
- pad_l, pad_r);
+ GetOutputAndPadSizeInMklOrder(input.shape(), filter.shape(),
+ strides, output_dims_tf_order,
+ output_dims_mkl_order, pad_l, pad_r);
}
// Wrapper function to calculate input, filter, and output sizes of
@@ -275,12 +279,15 @@ class MklDnnConvUtil {
// also calculates strides and paddings for 2D Convolution.
//
// Function does not return anything, but sets error in context status.
- inline void GetConvFwdSizesInMklOrder(
- const TensorShape &input_shape, const TensorShape &filter_shape,
- memory::dims *input_dims, memory::dims *filter_dims,
- memory::dims *strides, memory::dims *output_dims_tf_order,
- memory::dims *output_dims_mkl_order, memory::dims *pad_l,
- memory::dims *pad_r) {
+ inline void GetConvFwdSizesInMklOrder(const TensorShape& input_shape,
+ const TensorShape& filter_shape,
+ memory::dims *input_dims,
+ memory::dims *filter_dims,
+ memory::dims *strides,
+ memory::dims *output_dims_tf_order,
+ memory::dims *output_dims_mkl_order,
+ memory::dims *pad_l,
+ memory::dims *pad_r) {
CHECK_NOTNULL(input_dims);
CHECK_NOTNULL(filter_dims);
CHECK_NOTNULL(strides);
@@ -295,7 +302,8 @@ class MklDnnConvUtil {
if (!context_->status().ok()) return;
GetStridesInMklOrder(strides);
GetOutputAndPadSizeInMklOrder(input_shape, filter_shape, *strides,
- output_dims_tf_order, output_dims_mkl_order,
+ output_dims_tf_order,
+ output_dims_mkl_order,
pad_l, pad_r);
if (!context_->status().ok()) return;
}
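`GetOutputAndPadSizeInMklOrder` delegates the real work to `GetWindowedOutputSizeVerbose`, then splits the result into MKL-DNN's explicit asymmetric pads `pad_l = {top, left}` and `pad_r = {bottom, right}`. For SAME padding, the documented TensorFlow arithmetic that the verbose helper implements looks like this; a sketch of the documented semantics, not the library source:

    #include <algorithm>
    #include <cstdint>

    void SameOutputAndPad(int64_t in, int64_t filter, int64_t stride,
                          int64_t* out, int64_t* pad_before, int64_t* pad_after) {
      *out = (in + stride - 1) / stride;  // ceil(in / stride)
      const int64_t pad_needed =
          std::max<int64_t>(0, (*out - 1) * stride + filter - in);
      *pad_before = pad_needed / 2;           // feeds pad_l = {top, left}
      *pad_after = pad_needed - *pad_before;  // feeds pad_r = {bottom, right}
    }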
diff --git a/tensorflow/core/kernels/mkl_tfconv_op.cc b/tensorflow/core/kernels/mkl_tfconv_op.cc
deleted file mode 100644
index b48c735d12..0000000000
--- a/tensorflow/core/kernels/mkl_tfconv_op.cc
+++ /dev/null
@@ -1,124 +0,0 @@
-/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
- http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-
-#ifdef INTEL_MKL
-
-#include <algorithm>
-#include <vector>
-#include "tensorflow/core/framework/numeric_op.h"
-#include "tensorflow/core/framework/op.h"
-#include "tensorflow/core/framework/op_kernel.h"
-#include "tensorflow/core/framework/register_types.h"
-#include "tensorflow/core/framework/tensor.h"
-#include "tensorflow/core/framework/tensor_shape.h"
-#include "tensorflow/core/kernels/ops_util.h"
-#include "tensorflow/core/platform/cpu_info.h"
-#include "tensorflow/core/platform/macros.h"
-#include "tensorflow/core/util/tensor_format.h"
-
-#include "tensorflow/core/util/mkl_util.h"
-#include "mkl_dnn.h"
-#include "mkl_dnn_types.h"
-
-namespace tensorflow {
-typedef Eigen::ThreadPoolDevice CPUDevice;
-
-///////////////////////////////////////////////////////////
-// Op kernel
-///////////////////////////////////////////////////////////
-
-template <typename Device, typename T>
-class MklToTfOp : public OpKernel {
- public:
- explicit MklToTfOp(OpKernelConstruction* context) : OpKernel(context) {
- OP_REQUIRES_OK(context, context->GetAttr("data_format", &data_format_str));
- OP_REQUIRES_OK(context, context->GetAttr("T", &op_data_type));
- has_avx512f_ = port::TestCPUFeature(port::CPUFeature::AVX512F);
- }
-
- void Compute(OpKernelContext* context) override {
- // Check that input tensor is in MKL format.
- const Tensor& input_tensor = MklGetInput(context, 0);
- MklShape input_shape;
- GetMklShape(context, 0, &input_shape);
-
- // if input is already in Tf format, then just copy input tensor to output.
- if (!input_shape.IsMklTensor()) {
- context->set_output(0, input_tensor);
- VLOG(1) << "MKLToTFConversion: No conversion needed, "
- << "copying input to output";
- return;
- }
-
- // Check that input data type is same as operator data type and that it is
- // same as output data type.
- DataType input_data_type = input_type(0);
- DataType output_data_type = output_type(0);
- CHECK_EQ(op_data_type, input_data_type);
- CHECK_EQ(op_data_type, output_data_type);
-
- TensorShape output_shape;
- size_t ndims = input_shape.GetDimension();
- size_t* in_sizes = new size_t[ndims];
- for (size_t i = 0; i < ndims; i++) {
- // Outermost to innermost dimension
- output_shape.AddDim(input_shape.GetSizes()[input_shape.tf_dim_idx(i)]);
- in_sizes[i] = input_shape.GetSizes()[i];
- }
-
- // Allocate output tensor.
- Tensor* output_tensor = NULL;
- OP_REQUIRES_OK(context,
- context->allocate_output(0, output_shape, &output_tensor));
-
- dnnLayout_t output_layout =
- static_cast<dnnLayout_t>(input_shape.GetTfLayout());
- // Execute DNNConversion.
- void* input_buffer =
- static_cast<void*>(const_cast<T*>(input_tensor.flat<T>().data()));
- delete[] in_sizes;
- void* output_buffer =
- static_cast<void*>(const_cast<T*>(output_tensor->flat<T>().data()));
- input_shape.GetConvertedFlatData(output_layout, input_buffer,
- output_buffer);
- VLOG(1) << "MKLToTFConversion complete successfully.";
- }
-
- private:
- /// Data format of the operation
- string data_format_str;
-
- /// Data type of the operation
- DataType op_data_type;
-
- /// CPUIDInfo
- bool has_avx512f_ = false;
-};
-
-///////////////////////////////////////////////////////////
-// Register kernel
-///////////////////////////////////////////////////////////
-
-#define REGISTER_CPU(T) \
- REGISTER_KERNEL_BUILDER(Name("_MklToTf") \
- .Device(DEVICE_CPU) \
- .TypeConstraint<T>("T") \
- .Label(mkl_op_registry::kMklOpLabel), \
- MklToTfOp<CPUDevice, T>);
-
-TF_CALL_float(REGISTER_CPU);
-#undef REGISTER_CPU
-} // namespace tensorflow
-#endif /* INTEL_MKL */
diff --git a/tensorflow/core/kernels/mkl_tfconv_op.h b/tensorflow/core/kernels/mkl_tfconv_op.h
index a240ee44fb..0a5be4fec9 100644
--- a/tensorflow/core/kernels/mkl_tfconv_op.h
+++ b/tensorflow/core/kernels/mkl_tfconv_op.h
@@ -13,11 +13,11 @@ See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
-#ifdef INTEL_MKL
-
#ifndef TENSORFLOW_CORE_KERNELS_MKL_TFCONV_OP_H_
#define TENSORFLOW_CORE_KERNELS_MKL_TFCONV_OP_H_
+#ifdef INTEL_MKL
+
#include <algorithm>
#include <vector>
#include "tensorflow/core/framework/numeric_op.h"
@@ -35,6 +35,10 @@ limitations under the License.
#include "mkl_dnn_types.h"
#include "tensorflow/core/util/mkl_util.h"
+#ifdef INTEL_MKL_DNN
+using mkldnn::stream;
+#endif
+
namespace tensorflow {
typedef Eigen::ThreadPoolDevice CPUDevice;
@@ -57,6 +61,71 @@ class MklToTfOp : public OpKernel {
VLOG(1) << "MKLToTFConversion complete successfully.";
}
+#ifdef INTEL_MKL_DNN
+ static void ConvertMklToTf(OpKernel* op_kernel, OpKernelContext* context,
+ string data_format_str, DataType op_data_type,
+ bool has_avx512f, uint input_number) {
+ try {
+ // Check that input tensor is in MKL format.
+ const Tensor& input_tensor = MklGetInput(context, input_number);
+ MklDnnShape input_shape;
+ GetMklShape(context, input_number, &input_shape);
+
+ // if input is already in Tf format, then copy input tensor to output.
+ if (!input_shape.IsMklTensor()) {
+ context->set_output(input_number, input_tensor);
+ VLOG(1) << "MKLToTFConversion: No conversion needed, "
+ << "copying input to output";
+ return;
+ }
+
+ // Check that input data type is same as operator data type and that it
+ // is same as output data type.
+ DataType input_data_type = op_kernel->input_type(input_number);
+ DataType output_data_type = op_kernel->output_type(input_number);
+ CHECK_EQ(op_data_type, input_data_type);
+ CHECK_EQ(op_data_type, output_data_type);
+
+ auto cpu_engine = engine(engine::cpu, 0);
+ MklDnnData<T> input(&cpu_engine);
+
+ // Get Mkl layout of input tensor.
+ auto input_mkl_md = input_shape.GetMklLayout();
+ // Get TensorFlow layout of input tensor. Expected output of conversion
+ // has same layout as Tensorflow layout of input tensor.
+ auto output_tf_md = input_shape.GetTfLayout();
+ auto output_tf_pd = memory::primitive_desc(output_tf_md, cpu_engine);
+ // Set input Mkl layout as the user layout.
+ input.SetUsrMem(input_mkl_md, &input_tensor);
+
+ // Allocate output tensor.
+ TensorShape output_shape = input_shape.GetTfShape();
+ Tensor* output_tensor = NULL;
+ OP_REQUIRES_OK(context, context->allocate_output(input_number,
+ output_shape, &output_tensor));
+ CHECK_NOTNULL(output_tensor);
+
+ // Do we need to reorder Mkl layout into TensorFlow layout?
+ if (input.IsReorderNeeded(output_tf_pd)) {
+ // Insert reorder between Mkl layout and TensorFlow layout.
+ std::vector<primitive> net;
+ CHECK_EQ(input.CheckReorderToOpMem(output_tf_pd, output_tensor, &net),
+ true);
+ stream(stream::kind::eager).submit(net).wait();
+ } else {
+ // If not, just forward input tensor to output tensor.
+ CHECK(output_tensor->CopyFrom(input_tensor, output_shape));
+ }
+ } catch (mkldnn::error &e) {
+ string error_msg = "Status: " + std::to_string(e.status) +
+ ", message: " + std::string(e.message) +
+ ", in file " + std::string(__FILE__) + ":" +
+ std::to_string(__LINE__);
+ OP_REQUIRES_OK(context,
+ errors::Aborted("Operation received an exception:", error_msg));
+ }
+ }
+#else
static void ConvertMklToTf(OpKernel* op_kernel, OpKernelContext* context,
string data_format_str, DataType op_data_type,
bool has_avx512f, uint input_number) {
@@ -91,8 +160,8 @@ class MklToTfOp : public OpKernel {
// Allocate output tensor.
Tensor* output_tensor = NULL;
- OP_REQUIRES_OK(context,
- context->allocate_output(input_number, output_shape, &output_tensor));
+ OP_REQUIRES_OK(context, context->allocate_output(input_number,
+ output_shape, &output_tensor));
dnnLayout_t output_layout =
static_cast<dnnLayout_t>(input_shape.GetTfLayout());
@@ -106,6 +175,7 @@ class MklToTfOp : public OpKernel {
output_buffer);
VLOG(1) << "MKLToTFConversion complete successfully.";
}
+#endif
private:
/// Data format of the operation
@@ -132,5 +202,5 @@ class MklToTfOp : public OpKernel {
TF_CALL_NUMBER_TYPES(REGISTER_CPU);
#undef REGISTER_CPU
} // namespace tensorflow
-#endif // TENSORFLOW_CORE_KERNELS_MKL_TFCONV_OP_H_
#endif // INTEL_MKL
+#endif // TENSORFLOW_CORE_KERNELS_MKL_TFCONV_OP_H_
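The new `INTEL_MKL_DNN` branch of `ConvertMklToTf` replaces the old direct `GetConvertedFlatData` call with the reorder-primitive idiom: recover the MKL layout from the input's `MklDnnShape`, describe the plain TensorFlow layout, and run a reorder only when the two actually differ; otherwise the output tensor shares the input buffer via `CopyFrom`. The decision point, condensed from the hunk above:

    // input is an MklDnnData<T> wrapping the MKL-format tensor;
    // output_tf_pd is the memory::primitive_desc of the plain TF layout.
    if (input.IsReorderNeeded(output_tf_pd)) {
      std::vector<primitive> net;  // reorder Mkl layout -> TF layout
      CHECK(input.CheckReorderToOpMem(output_tf_pd, output_tensor, &net));
      stream(stream::kind::eager).submit(net).wait();
    } else {
      // Layouts already match: alias the buffer instead of copying bytes.
      CHECK(output_tensor->CopyFrom(input_tensor, output_shape));
    }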
diff --git a/tensorflow/core/kernels/pooling_ops_common.cc b/tensorflow/core/kernels/pooling_ops_common.cc
index 7dee751c4f..ac90f67ce0 100644
--- a/tensorflow/core/kernels/pooling_ops_common.cc
+++ b/tensorflow/core/kernels/pooling_ops_common.cc
@@ -143,7 +143,7 @@ void DnnPoolingOp<T>::Compute(
perftools::gputools::dnn::PoolingMode pooling_mode,
const std::vector<int32>& size, const std::vector<int32>& stride,
Padding padding, TensorFormat data_format, const Tensor& tensor_in,
- const TensorShape& tensor_out_shape) {
+ const TensorShape& tensor_out_shape, bool propagate_nans) {
Tensor* tensor_out = nullptr;
OP_REQUIRES_OK(context,
context->allocate_output(0, tensor_out_shape, &tensor_out));
@@ -188,7 +188,8 @@ void DnnPoolingOp<T>::Compute(
.set_vertical_stride(params.row_stride)
.set_horizontal_stride(params.col_stride)
.set_vertical_padding(params.pad_rows)
- .set_horizontal_padding(params.pad_cols);
+ .set_horizontal_padding(params.pad_cols)
+ .set_propagate_nans(propagate_nans);
perftools::gputools::dnn::BatchDescriptor input_desc;
input_desc.set_count(params.tensor_in_batch)
@@ -237,7 +238,7 @@ void DnnPoolingGradOp<T>::Compute(
const std::vector<int32>& size, const std::vector<int32>& stride,
Padding padding, TensorFormat data_format, const Tensor* tensor_in,
const Tensor* tensor_out, const Tensor& out_backprop,
- const TensorShape& tensor_in_shape) {
+ const TensorShape& tensor_in_shape, bool propagate_nans) {
CHECK((pooling_mode != perftools::gputools::dnn::PoolingMode::kMaximum) ||
(tensor_in && tensor_out))
<< "For MaxPoolGrad, both tensor_in and tensor_out needs to be "
@@ -327,7 +328,8 @@ void DnnPoolingGradOp<T>::Compute(
.set_vertical_stride(params.row_stride)
.set_horizontal_stride(params.col_stride)
.set_vertical_padding(params.pad_rows)
- .set_horizontal_padding(params.pad_cols);
+ .set_horizontal_padding(params.pad_cols)
+ .set_propagate_nans(propagate_nans);
perftools::gputools::dnn::BatchDescriptor orig_output_desc;
orig_output_desc.set_count(params.tensor_in_batch)
diff --git a/tensorflow/core/kernels/pooling_ops_common_gpu.h b/tensorflow/core/kernels/pooling_ops_common_gpu.h
index b594f39fad..1458456585 100644
--- a/tensorflow/core/kernels/pooling_ops_common_gpu.h
+++ b/tensorflow/core/kernels/pooling_ops_common_gpu.h
@@ -44,7 +44,7 @@ class DnnPoolingOp {
const std::vector<int32>& size,
const std::vector<int32>& stride, Padding padding,
TensorFormat data_format, const Tensor& tensor_in,
- const TensorShape& tensor_out_shape);
+ const TensorShape& tensor_out_shape, bool propagate_nans);
};
// A helper class that launch the cudnn pooling backward operations.
@@ -60,7 +60,7 @@ class DnnPoolingGradOp {
const std::vector<int32>& stride, Padding padding,
TensorFormat data_format, const Tensor* tensor_in,
const Tensor* tensor_out, const Tensor& out_backprop,
- const TensorShape& tensor_in_shape);
+ const TensorShape& tensor_in_shape, bool propagate_nans);
};
} // namespace tensorflow
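On the cuDNN path the flag travels through StreamExecutor's `PoolingDescriptor` via the new `set_propagate_nans` setter on both the forward and backward descriptors. Presumably this ultimately selects cuDNN's NaN-propagation mode when the descriptor is materialized; at the cuDNN API level that choice would look like the following sketch of the underlying library call, which is not code from this diff:

    #include <cudnn.h>

    // cudnnNanPropagation_t is the third argument of the 2D pooling setter.
    void ConfigureMaxPool(cudnnPoolingDescriptor_t desc, bool propagate_nans,
                          int window_rows, int window_cols, int pad_rows,
                          int pad_cols, int row_stride, int col_stride) {
      cudnnSetPooling2dDescriptor(
          desc, CUDNN_POOLING_MAX,
          propagate_nans ? CUDNN_PROPAGATE_NAN : CUDNN_NOT_PROPAGATE_NAN,
          window_rows, window_cols, pad_rows, pad_cols, row_stride, col_stride);
    }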
diff --git a/tensorflow/core/kernels/quantized_add_op.cc b/tensorflow/core/kernels/quantized_add_op.cc
index 8be0c56798..337c8e5c17 100644
--- a/tensorflow/core/kernels/quantized_add_op.cc
+++ b/tensorflow/core/kernels/quantized_add_op.cc
@@ -489,7 +489,7 @@ class QuantizedAddOp : public OpKernel {
// adding zero leaves the result unchanged, and to contain the largest of
// the two input values with some room to spare.
const float smallest_min = std::min(min_x, min_y);
- const float largest_max = std::min(max_x, max_y);
+ const float largest_max = std::max(max_x, max_y);
const float biggest_range =
std::max(std::abs(smallest_min), std::abs(largest_max));
const float output_range = (biggest_range * (1 << 14));
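This is the one behavioral fix among the quantized-add lines: the comment above demands an output range that contains "the largest of the two input values", but the old code took `std::min` of the two maxima, so the wider input's upper range was clipped. A worked instance with illustrative values:

    const float min_x = -3.0f, max_x = 6.0f, min_y = -1.0f, max_y = 1.0f;
    const float smallest_min = std::min(min_x, min_y);  // -3.0
    const float largest_max = std::max(max_x, max_y);   //  6.0 (1.0 with the old std::min)
    const float biggest_range =
        std::max(std::abs(smallest_min), std::abs(largest_max));  // 6.0, not 3.0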
diff --git a/tensorflow/core/kernels/random_op.cc b/tensorflow/core/kernels/random_op.cc
index a37c757865..55a8b9c9b6 100644
--- a/tensorflow/core/kernels/random_op.cc
+++ b/tensorflow/core/kernels/random_op.cc
@@ -577,7 +577,7 @@ struct FillPhiloxRandomKernel<Distribution, false> {
const size_t kGroupSize = Distribution::kResultElementCount;
const size_t item_id = item.get_global(0);
- const size_t total_item_count = item.get_global_range(0);
+ const size_t total_item_count = item.get_global_range();
size_t offset = item_id * kGroupSize;
gen_.Skip(item_id);
@@ -633,7 +633,7 @@ struct FillPhiloxRandomKernel<Distribution, true> {
PhiloxRandom::kResultElementCount;
const size_t item_id = item.get_global(0);
- const size_t total_item_count = item.get_global_range(0);
+ const size_t total_item_count = item.get_global_range();
size_t group_index = item_id;
size_t offset = group_index * kGroupSize;
diff --git a/tensorflow/core/kernels/range_dataset_op.cc b/tensorflow/core/kernels/range_dataset_op.cc
index e7ae840fc7..7adfcc4f8d 100644
--- a/tensorflow/core/kernels/range_dataset_op.cc
+++ b/tensorflow/core/kernels/range_dataset_op.cc
@@ -99,6 +99,7 @@ class RangeDatasetOp : public DatasetOpKernel {
if ((dataset()->step_ > 0 && next_ >= dataset()->stop_) ||
(dataset()->step_ < 0 && next_ <= dataset()->stop_)) {
*end_of_sequence = true;
+ is_exhausted_ = true;
return Status::OK();
}
Tensor value_tensor(cpu_allocator(), DT_INT64, {});
diff --git a/tensorflow/core/kernels/reader_dataset_ops.cc b/tensorflow/core/kernels/reader_dataset_ops.cc
index c08e42be1d..39ef92a5de 100644
--- a/tensorflow/core/kernels/reader_dataset_ops.cc
+++ b/tensorflow/core/kernels/reader_dataset_ops.cc
@@ -402,6 +402,7 @@ class FixedLengthRecordDatasetOp : public DatasetOpKernel {
// Iteration ends when there are no more files to process.
if (current_file_index_ == dataset()->filenames_.size()) {
*end_of_sequence = true;
+ is_exhausted_ = true;
return Status::OK();
}
diff --git a/tensorflow/core/kernels/repeat_dataset_op.cc b/tensorflow/core/kernels/repeat_dataset_op.cc
index 0167b9ea64..9813e99a70 100644
--- a/tensorflow/core/kernels/repeat_dataset_op.cc
+++ b/tensorflow/core/kernels/repeat_dataset_op.cc
@@ -95,15 +95,6 @@ class RepeatDatasetOp : public UnaryDatasetOpKernel {
*end_of_sequence = true;
return Status::OK();
}
-
- protected:
- Status SaveInternal(IteratorStateWriter* writer) override {
- return Status::OK();
- }
- Status RestoreInternal(OpKernelContext* ctx,
- IteratorStateReader* reader) override {
- return Status::OK();
- }
};
class FiniteIterator : public DatasetIterator<Dataset> {
@@ -117,10 +108,6 @@ class RepeatDatasetOp : public UnaryDatasetOpKernel {
std::vector<Tensor>* out_tensors,
bool* end_of_sequence) override {
mutex_lock l(mu_); // TODO(mrry): Make locking less conservative.
- if (!input_impl_) {
- *end_of_sequence = true;
- return Status::OK();
- }
while (i_ < dataset()->count_) {
TF_RETURN_IF_ERROR(
input_impl_->GetNext(ctx, out_tensors, end_of_sequence));
@@ -131,6 +118,7 @@ class RepeatDatasetOp : public UnaryDatasetOpKernel {
input_impl_ = dataset()->input_->MakeIterator(prefix());
}
*end_of_sequence = true;
+ is_exhausted_ = true;
input_impl_.reset();
return Status::OK();
}
@@ -139,12 +127,7 @@ class RepeatDatasetOp : public UnaryDatasetOpKernel {
Status SaveInternal(IteratorStateWriter* writer) override {
mutex_lock l(mu_);
TF_RETURN_IF_ERROR(writer->WriteScalar(full_name("i"), i_));
- if (!input_impl_) {
- TF_RETURN_IF_ERROR(
- writer->WriteScalar(full_name("input_impl_empty"), ""));
- } else {
- TF_RETURN_IF_ERROR(SaveParent(writer, input_impl_));
- }
+ TF_RETURN_IF_ERROR(SaveParent(writer, input_impl_));
return Status::OK();
}
@@ -152,11 +135,7 @@ class RepeatDatasetOp : public UnaryDatasetOpKernel {
IteratorStateReader* reader) override {
mutex_lock l(mu_);
TF_RETURN_IF_ERROR(reader->ReadScalar(full_name("i"), &i_));
- if (!reader->Contains(full_name("input_impl_empty"))) {
- TF_RETURN_IF_ERROR(RestoreParent(ctx, reader, input_impl_));
- } else {
- input_impl_.reset();
- }
+ TF_RETURN_IF_ERROR(RestoreParent(ctx, reader, input_impl_));
return Status::OK();
}
@@ -204,29 +183,6 @@ class RepeatDatasetOp : public UnaryDatasetOpKernel {
} while (true);
}
- protected:
- Status SaveInternal(IteratorStateWriter* writer) override {
- mutex_lock l(mu_);
- if (input_impl_)
- TF_RETURN_IF_ERROR(SaveParent(writer, input_impl_));
- else
- TF_RETURN_IF_ERROR(
- writer->WriteScalar(full_name("uninitialized"), ""));
- return Status::OK();
- }
-
- Status RestoreInternal(OpKernelContext* ctx,
- IteratorStateReader* reader) override {
- mutex_lock l(mu_);
- if (reader->Contains(full_name("uninitialized"))) {
- input_impl_.reset();
- } else {
- input_impl_ = dataset()->input_->MakeIterator(prefix());
- TF_RETURN_IF_ERROR(RestoreParent(ctx, reader, input_impl_));
- }
- return Status::OK();
- }
-
private:
mutex mu_;
std::unique_ptr<IteratorBase> input_impl_ GUARDED_BY(mu_);
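The revert strips the freshly added checkpoint hooks (`SaveInternal`/`RestoreInternal` and the `input_impl_empty`/`uninitialized` markers) from `RepeatDataset`'s iterators and instead records end-of-stream in an `is_exhausted_` flag, which appears to be a base-class member set by several datasets in this diff. The resulting control flow of `FiniteIterator::GetNext`, condensed; the early-return and `++i_` steps are inferred from the surrounding context lines:

    // Replay the input up to count_ times; each exhausted pass restarts the
    // input iterator, and final exhaustion is latched in is_exhausted_.
    while (i_ < dataset()->count_) {
      TF_RETURN_IF_ERROR(input_impl_->GetNext(ctx, out_tensors, end_of_sequence));
      if (!*end_of_sequence) return Status::OK();  // produced an element
      ++i_;                                        // finished one pass
      input_impl_ = dataset()->input_->MakeIterator(prefix());
    }
    *end_of_sequence = true;
    is_exhausted_ = true;
    input_impl_.reset();
    return Status::OK();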
diff --git a/tensorflow/core/kernels/segment_reduction_ops.cc b/tensorflow/core/kernels/segment_reduction_ops.cc
index 4302a68a18..2334e50f1d 100644
--- a/tensorflow/core/kernels/segment_reduction_ops.cc
+++ b/tensorflow/core/kernels/segment_reduction_ops.cc
@@ -376,6 +376,9 @@ struct UnsortedSegmentSumFunctor<CPUDevice, T, Index>
auto data_flat = typename TTypes<T, 2>::ConstTensor(data, N, data_size / N);
for (int64 i = 0; i < N; ++i) {
Index j = internal::SubtleMustCopy(segment_ids(i));
+ if (j < 0) {
+ continue;
+ }
OP_REQUIRES(ctx, FastBoundsCheck(j, output_rows),
errors::InvalidArgument(
"segment_ids", SliceDebugString(segment_ids_shape, i),
diff --git a/tensorflow/core/kernels/segment_reduction_ops.h b/tensorflow/core/kernels/segment_reduction_ops.h
index 412c1d601d..b10bea72ba 100644
--- a/tensorflow/core/kernels/segment_reduction_ops.h
+++ b/tensorflow/core/kernels/segment_reduction_ops.h
@@ -30,14 +30,14 @@ namespace functor {
#ifdef GOOGLE_CUDA
typedef Eigen::GpuDevice GPUDevice;
// Functor for SegmentSumGPUOp.
-// 'output_rows': the number of output segments (unique segment ids in
+// output_rows: the number of output segments (unique segment ids in
// 'segment_ids').
-// 'segment_ids_shape': shape of 'segment_ids' tensor.
-// 'segment_ids': unsorted map from input to output segment ids at which to
+// segment_ids_shape: shape of 'segment_ids' tensor.
+// segment_ids: unsorted map from input to output segment ids at which to
// perform segment sum operation.
-// 'data_size': size of input data tensor.
-// 'data': input data tensor.
-// 'output': output reshaped to {output_rows, output.size/output_rows}
+// data_size: size of input data tensor.
+// data: input data tensor.
+// output: output reshaped to {output_rows, output.size/output_rows}
template <typename T, typename Index>
struct SegmentSumFunctor {
void operator()(OpKernelContext* ctx, const GPUDevice& d,
@@ -61,14 +61,14 @@ struct UnsortedSegmentBaseFunctor{
};
// Functor for UnsortedSegmentSumOp.
-// 'output_rows': the number of output segments (unique segment ids in
+// output_rows: the number of output segments (unique segment ids in
// 'segment_ids').
-// 'segment_ids_shape': shape of 'segment_ids' tensor.
-// 'segment_ids': unsorted map from input to output segment ids at which to
+// segment_ids_shape: shape of 'segment_ids' tensor.
+// segment_ids: unsorted map from input to output segment ids at which to
// perform segment sum operation.
-// 'data_size': size of input data tensor.
-// 'data': input data tensor.
-// 'output': output reshaped to {output_rows, output.size/output_rows}
+// data_size: size of input data tensor.
+// data: input data tensor.
+// output: output reshaped to {output_rows, output.size/output_rows}
template <typename Device, typename T, typename Index>
struct UnsortedSegmentSumFunctor: public UnsortedSegmentBaseFunctor<Device, T, Index> {
void operator()(OpKernelContext* ctx, const Device& d,
@@ -79,14 +79,14 @@ struct UnsortedSegmentSumFunctor: public UnsortedSegmentBaseFunctor<Device, T, I
};
// Functor for UnsortedSegmentMaxOp.
-// 'output_rows': the number of output segments (unique segment ids in
+// output_rows: the number of output segments (unique segment ids in
// 'segment_ids').
-// 'segment_ids_shape': shape of 'segment_ids' tensor.
-// 'segment_ids': unsorted map from input to output segment ids at which to
+// segment_ids_shape: shape of 'segment_ids' tensor.
+// segment_ids: unsorted map from input to output segment ids at which to
// perform segment sum operation.
-// 'data_size': size of input data tensor.
-// 'data': input data tensor.
-// 'output': output reshaped to {output_rows, output.size/output_rows}
+// data_size: size of input data tensor.
+// data: input data tensor.
+// output: output reshaped to {output_rows, output.size/output_rows}
template <typename Device, typename T, typename Index>
struct UnsortedSegmentMaxFunctor: public UnsortedSegmentBaseFunctor<Device, T, Index> {
void operator()(OpKernelContext* ctx, const Device& d,
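
The functor comments above document the unsorted-segment-sum contract, and the CPU kernel hunk earlier now skips negative segment ids. A small sketch of the combined semantics on plain containers (illustrative names, not the TensorFlow API):

#include <cstdint>
#include <vector>

// Sketch of the documented semantics: out[segment_ids[i]] += data[i], with
// negative segment ids skipped as in the CPU kernel change above. Assumes
// every non-negative id is < num_segments.
std::vector<float> UnsortedSegmentSum(const std::vector<float>& data,
                                      const std::vector<int64_t>& segment_ids,
                                      int64_t num_segments) {
  std::vector<float> out(num_segments, 0.0f);
  for (std::size_t i = 0; i < data.size(); ++i) {
    const int64_t j = segment_ids[i];
    if (j < 0) continue;  // dropped, matching the new bounds handling
    out[j] += data[i];
  }
  return out;
}

// UnsortedSegmentSum({1, 2, 3, 4}, {0, 1, 0, -1}, 2) yields {4, 2}.
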
diff --git a/tensorflow/core/kernels/shape_ops.cc b/tensorflow/core/kernels/shape_ops.cc
index 721f9b949b..28a39bae3f 100644
--- a/tensorflow/core/kernels/shape_ops.cc
+++ b/tensorflow/core/kernels/shape_ops.cc
@@ -341,7 +341,12 @@ REGISTER_KERNEL_BUILDER(Name("ExpandDims")
.Device(DEVICE_CPU)
.HostMemory("dim")
.TypeConstraint<int32>("Tdim"),
- ExpandDimsOp);
+ ExpandDimsOp<int32>);
+REGISTER_KERNEL_BUILDER(Name("ExpandDims")
+ .Device(DEVICE_CPU)
+ .HostMemory("dim")
+ .TypeConstraint<int64>("Tdim"),
+ ExpandDimsOp<int64>);
#if GOOGLE_CUDA
#define REGISTER_GPU_KERNEL(type) \
@@ -350,7 +355,13 @@ REGISTER_KERNEL_BUILDER(Name("ExpandDims")
.TypeConstraint<type>("T") \
.TypeConstraint<int32>("Tdim") \
.HostMemory("dim"), \
- ExpandDimsOp);
+ ExpandDimsOp<int32>); \
+ REGISTER_KERNEL_BUILDER(Name("ExpandDims") \
+ .Device(DEVICE_GPU) \
+ .TypeConstraint<type>("T") \
+ .TypeConstraint<int64>("Tdim") \
+ .HostMemory("dim"), \
+ ExpandDimsOp<int64>);
TF_CALL_NUMBER_TYPES_NO_INT32(REGISTER_GPU_KERNEL);
TF_CALL_bool(REGISTER_GPU_KERNEL);
#undef REGISTER_GPU_KERNEL
@@ -362,7 +373,15 @@ REGISTER_KERNEL_BUILDER(Name("ExpandDims")
.HostMemory("input")
.HostMemory("dim")
.HostMemory("output"),
- ExpandDimsOp);
+ ExpandDimsOp<int32>);
+REGISTER_KERNEL_BUILDER(Name("ExpandDims")
+ .Device(DEVICE_GPU)
+ .TypeConstraint<int32>("T")
+ .TypeConstraint<int64>("Tdim")
+ .HostMemory("input")
+ .HostMemory("dim")
+ .HostMemory("output"),
+ ExpandDimsOp<int64>);
#endif // GOOGLE_CUDA
#ifdef TENSORFLOW_USE_SYCL
@@ -372,7 +391,13 @@ REGISTER_KERNEL_BUILDER(Name("ExpandDims")
.TypeConstraint<type>("T") \
.TypeConstraint<int32>("Tdim") \
.HostMemory("dim"), \
- ExpandDimsOp);
+ ExpandDimsOp<int32>); \
+ REGISTER_KERNEL_BUILDER(Name("ExpandDims") \
+ .Device(DEVICE_SYCL) \
+ .TypeConstraint<type>("T") \
+ .TypeConstraint<int64>("Tdim") \
+ .HostMemory("dim"), \
+ ExpandDimsOp<int64>);
TF_CALL_NUMBER_TYPES_NO_INT32(REGISTER_SYCL_KERNEL);
TF_CALL_bool(REGISTER_SYCL_KERNEL);
#undef REGISTER_SYCL_KERNEL
@@ -384,7 +409,15 @@ REGISTER_KERNEL_BUILDER(Name("ExpandDims")
.HostMemory("input")
.HostMemory("dim")
.HostMemory("output"),
- ExpandDimsOp);
+ ExpandDimsOp<int32>);
+REGISTER_KERNEL_BUILDER(Name("ExpandDims")
+ .Device(DEVICE_SYCL)
+ .TypeConstraint<int32>("T")
+ .TypeConstraint<int64>("Tdim")
+ .HostMemory("input")
+ .HostMemory("dim")
+ .HostMemory("output"),
+ ExpandDimsOp<int64>);
#endif // TENSORFLOW_USE_SYCL
// Squeeze ---------------------------------------
diff --git a/tensorflow/core/kernels/shape_ops.h b/tensorflow/core/kernels/shape_ops.h
index ac607f4e8b..8d9d0ea846 100644
--- a/tensorflow/core/kernels/shape_ops.h
+++ b/tensorflow/core/kernels/shape_ops.h
@@ -145,6 +145,7 @@ class SizeOp : public OpKernel {
bool IsExpensive() override { return false; }
};
+template <typename Tdim>
class ExpandDimsOp : public OpKernel {
public:
explicit ExpandDimsOp(OpKernelConstruction* ctx) : OpKernel(ctx) {}
@@ -153,7 +154,7 @@ class ExpandDimsOp : public OpKernel {
OP_REQUIRES(ctx, ctx->input(0).dtype() != DT_VARIANT,
errors::InvalidArgument("ExpandDims on Variant not supported"));
- int32 dim = ctx->input(1).flat<int32>()(0);
+ Tdim dim = ctx->input(1).flat<Tdim>()(0);
OP_REQUIRES(
ctx, (dim >= -1 - ctx->input(0).dims() && dim <= ctx->input(0).dims()),
errors::InvalidArgument("Tried to expand dim index ", dim,
@@ -175,7 +176,7 @@ class ExpandDimsOp : public OpKernel {
}
// Clamp to the end if needed.
- dim = std::min<int32>(dim, existing_dims_size);
+ dim = std::min<Tdim>(dim, existing_dims_size);
new_shape.emplace(new_shape.begin() + dim, 1);
const TensorShape output_shape(new_shape);
@@ -234,10 +235,10 @@ class SqueezeOp : public OpKernel {
if (!wrapped_squeeze_dims.empty()) {
if (wrapped_squeeze_dims.count(i) > 0) {
OP_REQUIRES(ctx, existing_dim == 1,
- errors::InvalidArgument(
- "Tried to explicitly squeeze "
- "dimension ",
- i, " but dimension was not 1: ", existing_dim));
+ errors::InvalidArgument("Tried to explicitly squeeze "
+ "dimension ",
+ i, " but dimension was not 1: ",
+ existing_dim));
} else {
// This dimension is not being squeezed.
new_shape.push_back(existing_dim);
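
The templated ExpandDimsOp above normalizes `dim` before inserting the new axis: a negative value counts from the end, and the result is clamped so the axis can be appended last. A hedged sketch of just that index arithmetic:

#include <algorithm>
#include <cstdint>
#include <vector>

// Sketch of the dim normalization for a templated Tdim (int32 or int64):
// a negative dim counts from the end, and the result is clamped so the new
// axis can be appended after the last dimension.
template <typename Tdim>
std::vector<int64_t> ExpandDims(std::vector<int64_t> shape, Tdim dim) {
  const Tdim rank = static_cast<Tdim>(shape.size());
  if (dim < 0) dim += rank + 1;     // e.g. dim == -1 appends at the end
  dim = std::min<Tdim>(dim, rank);  // clamp to the end if needed
  shape.insert(shape.begin() + dim, 1);
  return shape;
}

// ExpandDims<int64_t>({2, 3}, -1) -> {2, 3, 1};
// ExpandDims<int32_t>({2, 3}, 0) -> {1, 2, 3}.
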
diff --git a/tensorflow/core/kernels/shuffle_dataset_op.cc b/tensorflow/core/kernels/shuffle_dataset_op.cc
index dd0ab57e9d..2146ba2aa1 100644
--- a/tensorflow/core/kernels/shuffle_dataset_op.cc
+++ b/tensorflow/core/kernels/shuffle_dataset_op.cc
@@ -105,7 +105,8 @@ class ShuffleDatasetOp : public UnaryDatasetOpKernel {
mutex_lock l(mu_);
int64 start_micros = ctx->env()->NowMicros();
int64 num_log_entries = 0;
- while (input_impl_ && buffer_.size() < dataset()->buffer_size_) {
+ while (!end_of_input_sequence_ &&
+ buffer_.size() < dataset()->buffer_size_) {
if (ctx->env()->NowMicros() >
((num_log_entries + 1) * kLogIntervalMicros) + start_micros) {
num_log_entries++;
@@ -113,10 +114,9 @@ class ShuffleDatasetOp : public UnaryDatasetOpKernel {
<< buffer_.size() << " of " << dataset()->buffer_size_;
}
std::vector<Tensor> input_element;
- bool end_of_input_sequence;
TF_RETURN_IF_ERROR(input_impl_->GetNext(ctx, &input_element,
- &end_of_input_sequence));
- if (!end_of_input_sequence) {
+ &end_of_input_sequence_));
+ if (!end_of_input_sequence_) {
buffer_.emplace_back(std::move(input_element));
} else {
input_impl_.reset();
@@ -135,7 +135,7 @@ class ShuffleDatasetOp : public UnaryDatasetOpKernel {
std::swap(buffer_[index], buffer_.back());
buffer_.pop_back();
} else {
- DCHECK(input_impl_ == nullptr);
+ DCHECK(end_of_input_sequence_);
*end_of_sequence = true;
}
return Status::OK();
@@ -148,11 +148,11 @@ class ShuffleDatasetOp : public UnaryDatasetOpKernel {
// Save the tensors in the buffer.
TF_RETURN_IF_ERROR(
writer->WriteScalar(full_name("buffer_size"), buffer_.size()));
- for (size_t i = 0; i < buffer_.size(); i++) {
+ for (int i = 0; i < buffer_.size(); i++) {
TF_RETURN_IF_ERROR(writer->WriteScalar(
full_name(strings::StrCat("buffer_", i, "_size")),
buffer_[i].size()));
- for (size_t j = 0; j < buffer_[i].size(); j++) {
+ for (int j = 0; j < buffer_[i].size(); j++) {
TF_RETURN_IF_ERROR(writer->WriteTensor(
full_name(strings::StrCat("buffer_", i, "_", j)),
buffer_[i][j]));
@@ -165,7 +165,7 @@ class ShuffleDatasetOp : public UnaryDatasetOpKernel {
// Save the input iterator if it hasn't been exhausted; otherwise write
// "end_of_input_sequence".
- if (!input_impl_) {
+ if (end_of_input_sequence_) {
TF_RETURN_IF_ERROR(
writer->WriteScalar(full_name("end_of_input_sequence"), ""));
} else {
@@ -180,15 +180,10 @@ class ShuffleDatasetOp : public UnaryDatasetOpKernel {
buffer_.clear();
// Restore the buffer.
- size_t buffer_size;
- {
- int64 temp;
- TF_RETURN_IF_ERROR(
- reader->ReadScalar(full_name("buffer_size"), &temp));
- buffer_size = static_cast<size_t>(temp);
- }
- buffer_.reserve(buffer_size);
- for (size_t i = 0; i < buffer_size; i++) {
+ int64 buffer_size;
+ TF_RETURN_IF_ERROR(
+ reader->ReadScalar(full_name("buffer_size"), &buffer_size));
+ for (int i = 0; i < buffer_size; i++) {
int64 list_size;
TF_RETURN_IF_ERROR(reader->ReadScalar(
full_name(strings::StrCat("buffer_", i, "_size")), &list_size));
@@ -210,6 +205,7 @@ class ShuffleDatasetOp : public UnaryDatasetOpKernel {
input_impl_ = dataset()->input_->MakeIterator(prefix());
TF_RETURN_IF_ERROR(RestoreParent(ctx, reader, input_impl_));
} else {
+ end_of_input_sequence_ = true;
input_impl_.reset();
}
return Status::OK();
@@ -234,6 +230,7 @@ class ShuffleDatasetOp : public UnaryDatasetOpKernel {
mutex mu_;
std::vector<std::vector<Tensor>> buffer_ GUARDED_BY(mu_);
std::unique_ptr<IteratorBase> input_impl_ GUARDED_BY(mu_);
+ bool end_of_input_sequence_ GUARDED_BY(mu_) = false;
const int64 seed_ GUARDED_BY(mu_);
const int64 seed2_ GUARDED_BY(mu_);
random::PhiloxRandom parent_generator_ GUARDED_BY(mu_);
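
The shuffle iterator above fills a bounded buffer, then serves a uniformly random element and backfills its slot by swapping with the last entry; `end_of_input_sequence_` now records exhaustion instead of a null `input_impl_`. A minimal model of the buffer mechanics (illustrative, outside the TensorFlow API):

#include <cstddef>
#include <random>
#include <utility>
#include <vector>

// Fill a bounded buffer from the input, then repeatedly emit a uniformly
// random element and backfill its slot by swapping with the last element.
template <typename T>
class ShuffleBuffer {
 public:
  explicit ShuffleBuffer(std::size_t capacity) : capacity_(capacity) {}
  bool Full() const { return buffer_.size() >= capacity_; }
  void Add(T element) { buffer_.push_back(std::move(element)); }
  // Precondition: !buffer_.empty().
  T Next(std::mt19937& rng) {
    std::uniform_int_distribution<std::size_t> dist(0, buffer_.size() - 1);
    std::swap(buffer_[dist(rng)], buffer_.back());
    T out = std::move(buffer_.back());
    buffer_.pop_back();
    return out;
  }
 private:
  std::size_t capacity_;
  std::vector<T> buffer_;
};
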
diff --git a/tensorflow/core/kernels/skip_dataset_op.cc b/tensorflow/core/kernels/skip_dataset_op.cc
index 7ee945dd4c..52a6116a7c 100644
--- a/tensorflow/core/kernels/skip_dataset_op.cc
+++ b/tensorflow/core/kernels/skip_dataset_op.cc
@@ -35,14 +35,14 @@ class SkipDatasetOp : public UnaryDatasetOpKernel {
int64 count;
OP_REQUIRES_OK(ctx, ParseScalarArgument<int64>(ctx, "count", &count));
- *output = new Dataset(ctx, count, input);
+ *output = new Dataset(count, input);
}
private:
- class Dataset : public GraphDatasetBase {
+ class Dataset : public DatasetBase {
public:
- Dataset(OpKernelContext* ctx, int64 count, const DatasetBase* input)
- : GraphDatasetBase(ctx), count_(count), input_(input) {
+ Dataset(int64 count, const DatasetBase* input)
+ : count_(count), input_(input) {
input_->Ref();
}
@@ -71,18 +71,6 @@ class SkipDatasetOp : public UnaryDatasetOpKernel {
string DebugString() override { return "SkipDatasetOp::Dataset"; }
- protected:
- Status AsGraphDefInternal(DatasetGraphDefBuilder* b,
- Node** output) const override {
- Node* input_graph_node = nullptr;
- TF_RETURN_IF_ERROR(b->AddParentDataset(input_, &input_graph_node));
- Node* count = nullptr;
- TF_RETURN_IF_ERROR(b->AddScalar(count_, &count));
- TF_RETURN_IF_ERROR(
- b->AddDataset(this, {input_graph_node, count}, output));
- return Status::OK();
- }
-
private:
class EmptyIterator : public DatasetIterator<Dataset> {
public:
@@ -94,16 +82,6 @@ class SkipDatasetOp : public UnaryDatasetOpKernel {
*end_of_sequence = true;
return Status::OK();
}
-
- protected:
- Status SaveInternal(IteratorStateWriter* writer) override {
- return Status::OK();
- }
-
- Status RestoreInternal(OpKernelContext* ctx,
- IteratorStateReader* reader) override {
- return Status::OK();
- }
};
class FiniteIterator : public DatasetIterator<Dataset> {
@@ -118,11 +96,6 @@ class SkipDatasetOp : public UnaryDatasetOpKernel {
bool* end_of_sequence) override {
mutex_lock l(mu_); // TODO(mrry): Make locking less conservative.
- if (!input_impl_) {
- *end_of_sequence = true;
- return Status::OK();
- }
-
// Keep calling GetNext(). TODO(vrv): Figure out a way to
// skip records without reading, perhaps by adding an
// interface to iterator.
@@ -143,34 +116,6 @@ class SkipDatasetOp : public UnaryDatasetOpKernel {
// Return GetNext() on the underlying iterator.
TF_RETURN_IF_ERROR(input_impl_->GetNext(ctx, out_tensors,
end_of_sequence));
- if (*end_of_sequence) {
- input_impl_.reset();
- }
- return Status::OK();
- }
-
- protected:
- Status SaveInternal(IteratorStateWriter* writer) override {
- mutex_lock l(mu_);
- TF_RETURN_IF_ERROR(writer->WriteScalar(full_name("i"), i_));
- if (input_impl_) {
- TF_RETURN_IF_ERROR(SaveParent(writer, input_impl_));
- } else {
- TF_RETURN_IF_ERROR(
- writer->WriteScalar(full_name("input_impl_empty"), ""));
- }
- return Status::OK();
- }
-
- Status RestoreInternal(OpKernelContext* ctx,
- IteratorStateReader* reader) override {
- mutex_lock l(mu_);
- TF_RETURN_IF_ERROR(reader->ReadScalar(full_name("i"), &i_));
- if (!reader->Contains(full_name("input_impl_empty"))) {
- TF_RETURN_IF_ERROR(RestoreParent(ctx, reader, input_impl_));
- } else {
- input_impl_.reset();
- }
return Status::OK();
}
diff --git a/tensorflow/core/kernels/slice_op.cc b/tensorflow/core/kernels/slice_op.cc
index d46701749b..28a379774b 100644
--- a/tensorflow/core/kernels/slice_op.cc
+++ b/tensorflow/core/kernels/slice_op.cc
@@ -190,41 +190,25 @@ class SliceOp : public OpKernel {
}
return;
}
-#define HANDLE_DIM(NDIM) \
- if (input_dims == NDIM) { \
- HandleCase<NDIM>(context, begin, size, result); \
- return; \
+#define HANDLE_DIM(NDIM) \
+ if (input_dims == NDIM) { \
+ functor::Slice<Device, T, NDIM>()( \
+ context->eigen_device<Device>(), result, input, begin, size); \
+ return; \
}
-
HANDLE_DIM(1);
HANDLE_DIM(2);
HANDLE_DIM(3);
HANDLE_DIM(4);
HANDLE_DIM(5);
HANDLE_DIM(6);
- HANDLE_DIM(7);
#undef HANDLE_DIM
- OP_REQUIRES(context, false, errors::Unimplemented(
- "SliceOp : Unhandled input dimensions"));
- }
- }
-
- private:
- template <int NDIM>
- void HandleCase(OpKernelContext* context, const gtl::ArraySlice<int64>& begin,
- const gtl::ArraySlice<int64>& size, Tensor* result) {
- Eigen::DSizes<Eigen::DenseIndex, NDIM> indices;
- Eigen::DSizes<Eigen::DenseIndex, NDIM> sizes;
- for (int i = 0; i < NDIM; ++i) {
- indices[i] = begin[i];
- sizes[i] = size[i];
+ // Handle cases where dim >= 7.
+ functor::Slice<Device, T, 7>()(
+ context->eigen_device<Device>(), result, input, begin, size);
}
-
- functor::Slice<Device, T, NDIM>()(
- context->eigen_device<Device>(), result->tensor<T, NDIM>(),
- context->input(0).tensor<T, NDIM>(), indices, sizes);
}
};
@@ -264,11 +248,16 @@ class MklSliceOp : public OpKernel {
}
return;
}
-#define HANDLE_DIM(NDIM) \
- if (input_dims == NDIM) { \
- HandleCase<NDIM>(context, begin, size, result); \
- return; \
- }
+ // Special case for handling 4-D tensor slice.
+ if (input_dims == 4) {
+ HandleCase4D(context, begin, size, result);
+ } else {
+#define HANDLE_DIM(NDIM) \
+ if (input_dims == NDIM) { \
+ functor::Slice<Device, T, NDIM>()( \
+ context->eigen_device<Device>(), result, input, begin, size); \
+ return; \
+ }
HANDLE_DIM(1);
HANDLE_DIM(2);
@@ -276,12 +265,13 @@ class MklSliceOp : public OpKernel {
HANDLE_DIM(4);
HANDLE_DIM(5);
HANDLE_DIM(6);
- HANDLE_DIM(7);
#undef HANDLE_DIM
- OP_REQUIRES(context, false, errors::Unimplemented(
- "SliceOp : Unhandled input dimensions"));
+ // Handle cases where dim >= 7.
+ functor::Slice<Device, T, 7>()(
+ context->eigen_device<Device>(), result, input, begin, size);
+ }
}
}
@@ -328,8 +318,7 @@ class MklSliceOp : public OpKernel {
return false;
}
- template <int NDIM>
- void HandleCase(OpKernelContext* context,
+ void HandleCase4D(OpKernelContext* context,
const gtl::ArraySlice<int64>& begin,
const gtl::ArraySlice<int64>& size, Tensor* result) {
int slice_dim = -1;
@@ -338,8 +327,7 @@ class MklSliceOp : public OpKernel {
// differs from the input tensor in only 1 out of 4 dimensions.
// This case arises in the context of Slice of 4-D tensor in NHWC or NCHW
// format over channel dimension.
- if (NDIM == 4 &&
- DoesSliceShapeDifferInOnly1D(in_shape, begin, size, &slice_dim)) {
+ if (DoesSliceShapeDifferInOnly1D(in_shape, begin, size, &slice_dim)) {
size_t in_strides[4] = { (size_t) in_shape.dim_size(1) *
in_shape.dim_size(2) *
in_shape.dim_size(3),
@@ -403,16 +391,8 @@ class MklSliceOp : public OpKernel {
// slice_dim is not 1 or 3, then we fallback to Eigen implementation.
}
- Eigen::DSizes<Eigen::DenseIndex, NDIM> indices;
- Eigen::DSizes<Eigen::DenseIndex, NDIM> sizes;
- for (int i = 0; i < NDIM; ++i) {
- indices[i] = begin[i];
- sizes[i] = size[i];
- }
-
- functor::Slice<Device, T, NDIM>()(
- context->eigen_device<Device>(), result->tensor<T, NDIM>(),
- context->input(0).tensor<T, NDIM>(), indices, sizes);
+ functor::Slice<Device, T, 4>()(
+ context->eigen_device<Device>(), result, context->input(0), begin, size);
}
};
#endif
@@ -420,13 +400,13 @@ class MklSliceOp : public OpKernel {
// Forward declarations of the functor specializations declared in the
// sharded source files.
namespace functor {
-#define DECLARE_CPU_SPEC(T, NDIM) \
- template <> \
- void Slice<CPUDevice, T, NDIM>::operator()( \
- const CPUDevice& d, typename TTypes<T, NDIM>::Tensor output, \
- typename TTypes<T, NDIM>::ConstTensor input, \
- const Eigen::DSizes<Eigen::DenseIndex, NDIM>& indices, \
- const Eigen::DSizes<Eigen::DenseIndex, NDIM>& sizes); \
+#define DECLARE_CPU_SPEC(T, NDIM) \
+ template <> \
+ void Slice<CPUDevice, T, NDIM>::operator()( \
+ const CPUDevice& d, Tensor* output, \
+ const Tensor& input, \
+ const gtl::ArraySlice<int64>& slice_indices, \
+ const gtl::ArraySlice<int64>& slice_sizes); \
extern template struct Slice<CPUDevice, T, NDIM>;
#define DECLARE_FOR_N(T) \
@@ -476,13 +456,14 @@ REGISTER_SLICE(bfloat16);
#if GOOGLE_CUDA
// Forward declarations of the functor specializations for GPU.
namespace functor {
-#define DECLARE_GPU_SPEC(T, NDIM) \
- template <> \
- void Slice<GPUDevice, T, NDIM>::operator()( \
- const GPUDevice& d, typename TTypes<T, NDIM>::Tensor output, \
- typename TTypes<T, NDIM>::ConstTensor input, \
- const Eigen::DSizes<Eigen::DenseIndex, NDIM>& indices, \
- const Eigen::DSizes<Eigen::DenseIndex, NDIM>& sizes); \
+#define DECLARE_GPU_SPEC(T, NDIM) \
+ template <> \
+ void Slice<GPUDevice, T, NDIM>::operator()( \
+ const GPUDevice& d, \
+ Tensor* output, \
+ const Tensor& input, \
+ const gtl::ArraySlice<int64>& slice_indices, \
+ const gtl::ArraySlice<int64>& slice_sizes); \
extern template struct Slice<GPUDevice, T, NDIM>;
#define DECLARE_FOR_N(T) \
@@ -536,13 +517,14 @@ REGISTER_KERNEL_BUILDER(Name("Slice")
#ifdef TENSORFLOW_USE_SYCL
// Forward declarations of the functor specializations for SYCL.
namespace functor {
-#define DECLARE_SYCL_SPEC(T, NDIM) \
- template <> \
- void Slice<SYCLDevice, T, NDIM>::operator()( \
- const SYCLDevice& d, typename TTypes<T, NDIM>::Tensor output,\
- typename TTypes<T, NDIM>::ConstTensor input, \
- const Eigen::DSizes<Eigen::DenseIndex, NDIM>& indices, \
- const Eigen::DSizes<Eigen::DenseIndex, NDIM>& sizes); \
+#define DECLARE_SYCL_SPEC(T, NDIM) \
+ template <> \
+ void Slice<SYCLDevice, T, NDIM>::operator()( \
+ const SYCLDevice& d, \
+ Tensor* output, \
+ const Tensor& input, \
+ const gtl::ArraySlice<int64>& slice_indices, \
+ const gtl::ArraySlice<int64>& slice_sizes); \
extern template struct Slice<SYCLDevice, T, NDIM>;
#define DECLARE_FOR_N(T) \
diff --git a/tensorflow/core/kernels/slice_op.h b/tensorflow/core/kernels/slice_op.h
index db7eded745..55a4be985b 100644
--- a/tensorflow/core/kernels/slice_op.h
+++ b/tensorflow/core/kernels/slice_op.h
@@ -19,31 +19,104 @@ limitations under the License.
// Functor definition for SliceOp, must be compilable by nvcc.
#include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
+#include "tensorflow/core/framework/tensor.h"
#include "tensorflow/core/framework/tensor_types.h"
+#include "tensorflow/core/kernels/ops_util.h"
namespace tensorflow {
-namespace functor {
+
+namespace internal {
+
+template <typename Device, typename T>
+void SliceSimple(const Device& d, Tensor* out, const Tensor& in,
+ const gtl::ArraySlice<int64>& slice_indices);
+template <typename Device, typename T>
+void SliceSimpleGpu(const Device& d, Tensor* out, const Tensor& in,
+ const gtl::ArraySlice<int64>& slice_indices);
+
+template <typename Device, typename T>
+void SliceSimple(const Device& d, Tensor* out, const Tensor& in,
+ const gtl::ArraySlice<int64>& slice_indices) {
+ const int ndims = in.dims();
+ const int64 nelem = out->NumElements();
+ const gtl::InlinedVector<int64, 8> in_strides = ComputeStride<int64>(in.shape());
+ const gtl::InlinedVector<int64, 8> out_strides = ComputeStride<int64>(out->shape());
+ const T* p = in.flat<T>().data();
+ T* q = out->flat<T>().data();
+
+ std::vector<int64> i_idx(nelem, 0);
+ std::vector<int64> t(nelem, 0);
+
+ for (int64 o_idx = 0; o_idx < nelem; ++o_idx) {
+ t[o_idx] = o_idx;
+ }
+ for (int i = 0; i < ndims; ++i) {
+ int64 n = (nelem + 7) / 8;
+ int64 o_idx = 0;
+ switch (nelem % 8) {  // Duff's device: 8-way unrolled over output elements
+#define CALC_INPUT_IDX \
+ i_idx[o_idx] += (t[o_idx] / out_strides[i] + slice_indices[i]) * in_strides[i]; \
+ t[o_idx] %= out_strides[i]; \
+ ++o_idx;
+ case 0: do { CALC_INPUT_IDX;
+ case 7: CALC_INPUT_IDX;
+ case 6: CALC_INPUT_IDX;
+ case 5: CALC_INPUT_IDX;
+ case 4: CALC_INPUT_IDX;
+ case 3: CALC_INPUT_IDX;
+ case 2: CALC_INPUT_IDX;
+ case 1: CALC_INPUT_IDX;
+#undef CALC_INPUT_IDX
+ } while (--n > 0);
+ }
+ }
+ for (int64 o_idx = 0; o_idx < nelem; ++o_idx) {
+ q[o_idx] = p[i_idx[o_idx]];
+ }
+}
template <typename Device, typename T, int NDIMS>
+void SliceUsingEigen(const Device& d, Tensor* out, const Tensor& in,
+ const gtl::ArraySlice<int64>& slice_indices,
+ const gtl::ArraySlice<int64>& slice_sizes) {
+ auto input = in.tensor<T, NDIMS>();
+ auto output = out->tensor<T, NDIMS>();
+ Eigen::DSizes<int, NDIMS> indices;
+ for (int i = 0; i < NDIMS; ++i) {
+ indices[i] = slice_indices[i];
+ }
+ Eigen::DSizes<int, NDIMS> sizes;
+ for (int i = 0; i < NDIMS; ++i) {
+ sizes[i] = slice_sizes[i];
+ }
+ const bool use_64bit = input.size() > Eigen::NumTraits<int>::highest();
+ if (!use_64bit &&
+ Eigen::internal::is_same<Device, Eigen::GpuDevice>::value) {
+ To32Bit(output).device(d) = To32Bit(input).slice(indices, sizes);
+ } else {
+ output.device(d) = input.slice(indices, sizes);
+ }
+}
+
+} // namespace internal
+
+namespace functor {
+
+// Template parameter NDIM is not necessary here. It is kept so that the
+// Slice struct can be compiled separately, which minimizes compilation time.
+template <typename Device, typename T, int NDIM>
struct Slice {
- void operator()(const Device& d, typename TTypes<T, NDIMS>::Tensor output,
- typename TTypes<T, NDIMS>::ConstTensor input,
- const Eigen::DSizes<Eigen::DenseIndex, NDIMS>& slice_indices,
- const Eigen::DSizes<Eigen::DenseIndex, NDIMS>& slice_sizes) {
- bool use_64bit = (input.size() > Eigen::NumTraits<int>::highest());
- if (!use_64bit &&
- Eigen::internal::is_same<Device, Eigen::GpuDevice>::value) {
- Eigen::DSizes<int, NDIMS> indices;
- for (int i = 0; i < NDIMS; ++i) {
- indices[i] = slice_indices[i];
- }
- Eigen::DSizes<int, NDIMS> sizes;
- for (int i = 0; i < NDIMS; ++i) {
- sizes[i] = slice_sizes[i];
- }
- To32Bit(output).device(d) = To32Bit(input).slice(indices, sizes);
+ void operator()(const Device& d, Tensor* out, const Tensor& in,
+ const gtl::ArraySlice<int64>& slice_indices,
+ const gtl::ArraySlice<int64>& slice_sizes) {
+ if (in.dims() == NDIM) {
+ internal::SliceUsingEigen<Device, T, NDIM>(d, out, in, slice_indices, slice_sizes);
} else {
- output.device(d) = input.slice(slice_indices, slice_sizes);
+ if (Eigen::internal::is_same<Device, Eigen::GpuDevice>::value) {
+ internal::SliceSimpleGpu<Device, T>(d, out, in, slice_indices);
+ } else {
+ internal::SliceSimple<Device, T>(d, out, in, slice_indices);
+ }
}
}
};
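
The generic fallback above maps each flat output index to a flat input index through row-major strides. A small standalone sketch of that arithmetic, with a worked example:

#include <cstddef>
#include <cstdint>
#include <vector>

// Row-major strides turn a flat output index into per-dimension coordinates,
// which are offset by the slice start and folded back into a flat input index.
std::vector<int64_t> RowMajorStrides(const std::vector<int64_t>& shape) {
  std::vector<int64_t> strides(shape.size());
  int64_t s = 1;
  for (int i = static_cast<int>(shape.size()) - 1; i >= 0; --i) {
    strides[i] = s;
    s *= shape[i];
  }
  return strides;
}

int64_t InputIndex(int64_t out_idx, const std::vector<int64_t>& out_strides,
                   const std::vector<int64_t>& in_strides,
                   const std::vector<int64_t>& slice_indices) {
  int64_t in_idx = 0;
  int64_t t = out_idx;
  for (std::size_t i = 0; i < out_strides.size(); ++i) {
    in_idx += (t / out_strides[i] + slice_indices[i]) * in_strides[i];
    t %= out_strides[i];
  }
  return in_idx;
}

// For input shape {4, 5}, output shape {2, 2}, slice start {1, 2}: output
// element (1, 1) has out_idx 3 and maps to input index (1+1)*5 + (1+2) = 13.
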
diff --git a/tensorflow/core/kernels/slice_op_gpu.cu.cc b/tensorflow/core/kernels/slice_op_gpu.cu.cc
index a301986f2f..3039b3d777 100644
--- a/tensorflow/core/kernels/slice_op_gpu.cu.cc
+++ b/tensorflow/core/kernels/slice_op_gpu.cu.cc
@@ -21,9 +21,65 @@ limitations under the License.
#include "tensorflow/core/framework/register_types.h"
#include "tensorflow/core/framework/tensor_types.h"
+#include "tensorflow/core/kernels/ops_util.h"
#include "tensorflow/core/platform/types.h"
+#include "tensorflow/core/util/cuda_kernel_helper.h"
namespace tensorflow {
+namespace internal {
+
+template <typename T>
+__global__ void SliceKernel(int nthreads, const T* src, const int32* buf,
+ const int32 ndims, T* dst) {
+ const int32* in_strides = buf;
+ const int32* out_strides = buf + ndims;
+ const int32* slice_indices = buf + ndims * 2;
+ CUDA_1D_KERNEL_LOOP(o_idx, nthreads) {
+ int32 i_idx = 0;
+ int32 t = o_idx;
+ for (int i = 0; i < ndims; ++i) {
+ i_idx += (t / out_strides[i] + slice_indices[i]) * in_strides[i];
+ t %= out_strides[i];
+ }
+ dst[o_idx] = ldg(src + i_idx);
+ }
+}
+
+template <typename Device, typename T>
+void SliceSimpleGpu(const Device& d, Tensor* out, const Tensor& in,
+ const gtl::ArraySlice<int64>& slice_indices) {
+ // Ensures we can use 32-bit index.
+ const int64 in_nelem = in.NumElements();
+ CHECK_LT(in_nelem, kint32max) << "Tensor too large to slice on GPU";
+ const int64 out_nelem = out->NumElements();
+ CHECK_LT(out_nelem, kint32max) << "Tensor too large to slice on GPU";
+ // Pack strides and slice indices sizes into one buffer.
+ const int32 ndims = in.dims();
+ gtl::InlinedVector<int32, 24> host_buf(ndims * 3);
+ gtl::InlinedVector<int32, 8> in_strides = ComputeStride<int32>(in.shape());
+ gtl::InlinedVector<int32, 8> out_strides = ComputeStride<int32>(out->shape());
+ for (int i = 0; i < ndims; ++i) {
+ host_buf[i] = in_strides[i];
+ host_buf[ndims + i] = out_strides[i];
+ host_buf[ndims * 2 + i] = slice_indices[i];
+ }
+ auto num_bytes = sizeof(int32) * host_buf.size();  // host_buf holds int32s
+ auto dev_buf = d.allocate(num_bytes);
+ // NOTE: host_buf is not allocated by CudaHostAllocator, and
+ // therefore we are doing a sync copy effectively.
+ d.memcpyHostToDevice(dev_buf, host_buf.data(), num_bytes);
+ // Launch kernel to q[...] = p[...].
+ const T* p = in.flat<T>().data();
+ T* q = out->flat<T>().data();
+ CudaLaunchConfig cfg = GetCudaLaunchConfig(out_nelem, d);
+ SliceKernel<<<cfg.block_count, cfg.thread_per_block, 0, d.stream()>>>(
+ cfg.virtual_thread_count, p, reinterpret_cast<const int32*>(dev_buf),
+ ndims, q);
+ // Safe to deallocate immediately after the kernel launch.
+ d.deallocate(dev_buf);
+}
+
+} // namespace internal
typedef Eigen::GpuDevice GPUDevice;
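
SliceSimpleGpu above packs three small int32 arrays into a single buffer so one host-to-device copy carries all kernel metadata. A sketch of that packing on the host side (illustrative helper, not the TensorFlow API):

#include <cstddef>
#include <cstdint>
#include <vector>

// The input strides, output strides, and slice start indices are laid out
// back-to-back so the device side can read them at buf, buf + ndims, and
// buf + 2 * ndims respectively.
std::vector<int32_t> PackSliceMetadata(
    const std::vector<int32_t>& in_strides,
    const std::vector<int32_t>& out_strides,
    const std::vector<int64_t>& slice_indices) {
  const std::size_t ndims = in_strides.size();
  std::vector<int32_t> buf(ndims * 3);
  for (std::size_t i = 0; i < ndims; ++i) {
    buf[i] = in_strides[i];
    buf[ndims + i] = out_strides[i];
    buf[ndims * 2 + i] = static_cast<int32_t>(slice_indices[i]);
  }
  return buf;
}
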
diff --git a/tensorflow/core/kernels/strided_slice_op.cc b/tensorflow/core/kernels/strided_slice_op.cc
index 73b6d4cf6a..8fc40db3cc 100644
--- a/tensorflow/core/kernels/strided_slice_op.cc
+++ b/tensorflow/core/kernels/strided_slice_op.cc
@@ -427,7 +427,6 @@ REGISTER_STRIDED_SLICE(bfloat16);
TF_CALL_GPU_NUMBER_TYPES(REGISTER_GPU);
TF_CALL_complex64(REGISTER_GPU);
TF_CALL_complex128(REGISTER_GPU);
-TF_CALL_int64(REGISTER_GPU);
// A special GPU kernel for int32.
// TODO(b/25387198): Also enable int32 in device memory. This kernel
diff --git a/tensorflow/core/kernels/strided_slice_op_impl.h b/tensorflow/core/kernels/strided_slice_op_impl.h
index afe3a051e6..7d42887426 100644
--- a/tensorflow/core/kernels/strided_slice_op_impl.h
+++ b/tensorflow/core/kernels/strided_slice_op_impl.h
@@ -84,16 +84,16 @@ void HandleStridedSliceCase(OpKernelContext* context,
gtl::InlinedVector<int64, 4> processing_dims = processing_shape.dim_sizes();
if (is_simple_slice) {
- Eigen::DSizes<Eigen::DenseIndex, NDIM> begin_di;
- Eigen::DSizes<Eigen::DenseIndex, NDIM> sizes_di;
+ gtl::InlinedVector<int64, 4> sizes(begin.size());
for (int i = 0; i < NDIM; ++i) {
- begin_di[i] = begin[i];
- sizes_di[i] = end[i] - begin[i];
+ sizes[i] = end[i] - begin[i];
}
- functor::Slice<Device, Proxy, NDIM>()(
- context->eigen_device<Device>(),
- result->bit_casted_shaped<Proxy, NDIM>(processing_dims),
- context->input(0).bit_casted_tensor<Proxy, NDIM>(), begin_di, sizes_di);
+ const TensorShape final_shape = result->shape();
+ CHECK(result->CopyFrom(*result, processing_shape));
+ const Tensor input = context->input(0);
+ functor::Slice<Device, T, NDIM>()(
+ context->eigen_device<Device>(), result, input, begin, sizes);
+ CHECK(result->CopyFrom(*result, final_shape));
} else {
Eigen::DSizes<Eigen::DenseIndex, NDIM> begin_di;
Eigen::DSizes<Eigen::DenseIndex, NDIM> end_di;
@@ -196,10 +196,9 @@ class HandleStridedSliceAssignCase<Device, T, 0> {
extern template struct StridedSlice<GPUDevice, T, NDIM>; \
template <> \
void Slice<GPUDevice, T, NDIM>::operator()( \
- const GPUDevice& d, typename TTypes<T, NDIM>::Tensor output, \
- typename TTypes<T, NDIM>::ConstTensor input, \
- const Eigen::DSizes<Eigen::DenseIndex, NDIM>& indices, \
- const Eigen::DSizes<Eigen::DenseIndex, NDIM>& sizes); \
+ const GPUDevice& d, Tensor* output, const Tensor& input, \
+ const gtl::ArraySlice<int64>& slice_indices, \
+ const gtl::ArraySlice<int64>& slice_sizes); \
extern template struct Slice<GPUDevice, T, NDIM>; \
template <> \
void StridedSliceGrad<GPUDevice, T, NDIM>::operator()( \
@@ -284,7 +283,6 @@ TF_CALL_GPU_NUMBER_TYPES(DECLARE_FOR_N_GPU);
TF_CALL_complex64(DECLARE_FOR_N_GPU);
TF_CALL_complex128(DECLARE_FOR_N_GPU);
DECLARE_FOR_N_GPU(int32);
-DECLARE_FOR_N_GPU(int64);
#endif // END GOOGLE_CUDA
TF_CALL_ALL_TYPES(DECLARE_FOR_N_CPU);
@@ -300,7 +298,6 @@ DECLARE_FOR_N_CPU(bfloat16);
TF_CALL_SYCL_PROXY_TYPES(PREVENT_FOR_N_SYCL);
TF_CALL_GPU_NUMBER_TYPES_NO_HALF(DECLARE_FOR_N_SYCL);
DECLARE_FOR_N_SYCL(int32);
-DECLARE_FOR_N_SYCL(int64);
#undef DECLARE_FOR_N_SYCL
#endif // TENSORFLOW_USE_SYCL
diff --git a/tensorflow/core/kernels/strided_slice_op_test.cc b/tensorflow/core/kernels/strided_slice_op_test.cc
index 281ca0f58f..78bb15463c 100644
--- a/tensorflow/core/kernels/strided_slice_op_test.cc
+++ b/tensorflow/core/kernels/strided_slice_op_test.cc
@@ -76,20 +76,69 @@ static void SliceHelper(int iters, int size) {
testing::UseRealTime();
}
+template <typename T>
+static void Dim8SliceHelper(int iters, int size) {
+ testing::StopTiming();
+ Graph* g = new Graph(OpRegistry::Global());
+ DataType dt = DataTypeToEnum<T>::v();
+ int kDim = 100;
+ int kMaxSize = 15000;
+ CHECK_LT(size, kMaxSize);
+
+ Tensor begin(DT_INT32, TensorShape({8}));
+ begin.flat<int32>()(0) = 10;
+ for (int i = 1; i < 7; ++i) {
+ begin.flat<int32>()(i) = 0;
+ }
+ begin.flat<int32>()(7) = 10;
+
+ Tensor end(DT_INT32, TensorShape({8}));
+ end.flat<int32>()(0) = 10 + kDim;
+ for (int i = 1; i < 7; ++i) {
+ end.flat<int32>()(i) = 1;
+ }
+ end.flat<int32>()(7) = 10 + size;
+
+ Tensor strides(DT_INT32, TensorShape({8}));
+ for (int i = 0; i < 8; ++i) {
+ strides.flat<int32>()(i) = 1;
+ }
+
+ Tensor input(dt, TensorShape({2*kDim, 1, 1, 1, 1, 1, 1, kMaxSize}));
+ input.flat<T>().setRandom();
+
+ Node* node;
+ TF_CHECK_OK(NodeBuilder(g->NewName("n"), "StridedSlice")
+ .Input(test::graph::Constant(g, input))
+ .Input(test::graph::Constant(g, begin))
+ .Input(test::graph::Constant(g, end))
+ .Input(test::graph::Constant(g, strides))
+ .Attr("T", dt)
+ .Finalize(g, &node));
+
+ testing::BytesProcessed(static_cast<int64>(iters) * kDim * size * sizeof(T));
+ testing::StartTiming();
+ test::Benchmark("cpu", g).Run(iters);
+ testing::UseRealTime();
+}
+
static void BM_SliceFloat(int iters, int dim2) {
SliceHelper<float>(iters, dim2);
+ Dim8SliceHelper<float>(iters, dim2);
}
BENCHMARK(BM_SliceFloat)->Arg(100)->Arg(1000)->Arg(10000);
static void BM_SliceComplex64(int iters, int dim2) {
SliceHelper<std::complex<float>>(iters, dim2);
+ Dim8SliceHelper<std::complex<float>>(iters, dim2);
}
BENCHMARK(BM_SliceComplex64)->Arg(100)->Arg(1000)->Arg(10000);
static void BM_SliceBFloat16(int iters, int dim2) {
SliceHelper<bfloat16>(iters, dim2);
+ Dim8SliceHelper<bfloat16>(iters, dim2);
}
BENCHMARK(BM_SliceBFloat16)->Arg(100)->Arg(1000)->Arg(10000);
diff --git a/tensorflow/core/kernels/summary_interface.cc b/tensorflow/core/kernels/summary_interface.cc
index cd366f8c13..313137ae49 100644
--- a/tensorflow/core/kernels/summary_interface.cc
+++ b/tensorflow/core/kernels/summary_interface.cc
@@ -257,9 +257,7 @@ class SummaryWriterImpl : public SummaryWriterInterface {
Summary::Value* v = e->mutable_summary()->add_value();
t.AsProtoTensorContent(v->mutable_tensor());
v->set_tag(tag);
- if (!serialized_metadata.empty()) {
- v->mutable_metadata()->ParseFromString(serialized_metadata);
- }
+ v->mutable_metadata()->ParseFromString(serialized_metadata);
return WriteEvent(std::move(e));
}
diff --git a/tensorflow/core/kernels/summary_kernels.cc b/tensorflow/core/kernels/summary_kernels.cc
index 1fe2fc5b66..cfa707de71 100644
--- a/tensorflow/core/kernels/summary_kernels.cc
+++ b/tensorflow/core/kernels/summary_kernels.cc
@@ -13,12 +13,9 @@ See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
-#include "tensorflow/contrib/tensorboard/db/summary_db_writer.h"
#include "tensorflow/core/framework/op_kernel.h"
#include "tensorflow/core/framework/resource_mgr.h"
#include "tensorflow/core/kernels/summary_interface.h"
-#include "tensorflow/core/lib/db/sqlite.h"
-#include "tensorflow/core/platform/protobuf.h"
namespace tensorflow {
@@ -49,32 +46,6 @@ class CreateSummaryFileWriterOp : public OpKernel {
REGISTER_KERNEL_BUILDER(Name("CreateSummaryFileWriter").Device(DEVICE_CPU),
CreateSummaryFileWriterOp);
-class CreateSummaryDbWriterOp : public OpKernel {
- public:
- explicit CreateSummaryDbWriterOp(OpKernelConstruction* ctx) : OpKernel(ctx) {}
-
- void Compute(OpKernelContext* ctx) override {
- const Tensor* tmp;
- OP_REQUIRES_OK(ctx, ctx->input("db_uri", &tmp));
- const string db_uri = tmp->scalar<string>()();
- OP_REQUIRES_OK(ctx, ctx->input("experiment_name", &tmp));
- const string experiment_name = tmp->scalar<string>()();
- OP_REQUIRES_OK(ctx, ctx->input("run_name", &tmp));
- const string run_name = tmp->scalar<string>()();
- OP_REQUIRES_OK(ctx, ctx->input("user_name", &tmp));
- const string user_name = tmp->scalar<string>()();
- SummaryWriterInterface* s;
- auto db = Sqlite::Open(db_uri);
- OP_REQUIRES_OK(ctx, db.status());
- OP_REQUIRES_OK(
- ctx, CreateSummaryDbWriter(std::move(db.ValueOrDie()), experiment_name,
- run_name, user_name, ctx->env(), &s));
- OP_REQUIRES_OK(ctx, CreateResource(ctx, HandleFromInput(ctx, 0), s));
- }
-};
-REGISTER_KERNEL_BUILDER(Name("CreateSummaryDbWriter").Device(DEVICE_CPU),
- CreateSummaryDbWriterOp);
-
class FlushSummaryWriterOp : public OpKernel {
public:
explicit FlushSummaryWriterOp(OpKernelConstruction* ctx) : OpKernel(ctx) {}
@@ -127,27 +98,6 @@ class WriteSummaryOp : public OpKernel {
REGISTER_KERNEL_BUILDER(Name("WriteSummary").Device(DEVICE_CPU),
WriteSummaryOp);
-class ImportEventOp : public OpKernel {
- public:
- explicit ImportEventOp(OpKernelConstruction* ctx) : OpKernel(ctx) {}
-
- void Compute(OpKernelContext* ctx) override {
- SummaryWriterInterface* s;
- OP_REQUIRES_OK(ctx, LookupResource(ctx, HandleFromInput(ctx, 0), &s));
- core::ScopedUnref unref(s);
- const Tensor* t;
- OP_REQUIRES_OK(ctx, ctx->input("event", &t));
- std::unique_ptr<Event> event{new Event};
- if (!ParseProtoUnlimited(event.get(), t->scalar<string>()())) {
- ctx->CtxFailureWithWarning(
- errors::DataLoss("Bad tf.Event binary proto tensor string"));
- return;
- }
- OP_REQUIRES_OK(ctx, s->WriteEvent(std::move(event)));
- }
-};
-REGISTER_KERNEL_BUILDER(Name("ImportEvent").Device(DEVICE_CPU), ImportEventOp);
-
class WriteScalarSummaryOp : public OpKernel {
public:
explicit WriteScalarSummaryOp(OpKernelConstruction* ctx) : OpKernel(ctx) {}
diff --git a/tensorflow/core/kernels/take_dataset_op.cc b/tensorflow/core/kernels/take_dataset_op.cc
index fb294a96b1..c3f33d663c 100644
--- a/tensorflow/core/kernels/take_dataset_op.cc
+++ b/tensorflow/core/kernels/take_dataset_op.cc
@@ -35,14 +35,14 @@ class TakeDatasetOp : public UnaryDatasetOpKernel {
// Create a new TakeDatasetOp::Dataset, and return it as the output.
int64 count;
OP_REQUIRES_OK(ctx, ParseScalarArgument<int64>(ctx, "count", &count));
- *output = new Dataset(ctx, count, input);
+ *output = new Dataset(count, input);
}
private:
- class Dataset : public GraphDatasetBase {
+ class Dataset : public DatasetBase {
public:
- Dataset(OpKernelContext* ctx, int64 count, const DatasetBase* input)
- : GraphDatasetBase(ctx), count_(count), input_(input) {
+ Dataset(int64 count, const DatasetBase* input)
+ : count_(count), input_(input) {
input_->Ref();
}
@@ -72,18 +72,6 @@ class TakeDatasetOp : public UnaryDatasetOpKernel {
string DebugString() override { return "TakeDatasetOp::Dataset"; }
- protected:
- Status AsGraphDefInternal(DatasetGraphDefBuilder* b,
- Node** output) const override {
- Node* input_graph_node = nullptr;
- TF_RETURN_IF_ERROR(b->AddParentDataset(input_, &input_graph_node));
- Node* count = nullptr;
- TF_RETURN_IF_ERROR(b->AddScalar(count_, &count));
- TF_RETURN_IF_ERROR(
- b->AddDataset(this, {input_graph_node, count}, output));
- return Status::OK();
- }
-
private:
class EmptyIterator : public DatasetIterator<Dataset> {
public:
@@ -95,16 +83,6 @@ class TakeDatasetOp : public UnaryDatasetOpKernel {
*end_of_sequence = true;
return Status::OK();
}
-
- protected:
- Status SaveInternal(IteratorStateWriter* writer) override {
- return Status::OK();
- }
-
- Status RestoreInternal(OpKernelContext* ctx,
- IteratorStateReader* reader) override {
- return Status::OK();
- }
};
class FiniteIterator : public DatasetIterator<Dataset> {
@@ -118,10 +96,6 @@ class TakeDatasetOp : public UnaryDatasetOpKernel {
std::vector<Tensor>* out_tensors,
bool* end_of_sequence) override {
mutex_lock l(mu_); // TODO(mrry): Make locking less conservative.
- if (!input_impl_) {
- *end_of_sequence = true;
- return Status::OK();
- }
while (i_ < dataset()->count_) {
TF_RETURN_IF_ERROR(
input_impl_->GetNext(ctx, out_tensors, end_of_sequence));
@@ -136,31 +110,6 @@ class TakeDatasetOp : public UnaryDatasetOpKernel {
return Status::OK();
}
- protected:
- Status SaveInternal(IteratorStateWriter* writer) override {
- mutex_lock l(mu_);
- TF_RETURN_IF_ERROR(writer->WriteScalar(full_name("i"), i_));
- if (input_impl_) {
- TF_RETURN_IF_ERROR(SaveParent(writer, input_impl_));
- } else {
- TF_RETURN_IF_ERROR(
- writer->WriteScalar(full_name("input_impl_empty"), ""));
- }
- return Status::OK();
- }
-
- Status RestoreInternal(OpKernelContext* ctx,
- IteratorStateReader* reader) override {
- mutex_lock l(mu_);
- TF_RETURN_IF_ERROR(reader->ReadScalar(full_name("i"), &i_));
- if (!reader->Contains(full_name("input_impl_empty"))) {
- TF_RETURN_IF_ERROR(RestoreParent(ctx, reader, input_impl_));
- } else {
- input_impl_.reset();
- }
- return Status::OK();
- }
-
private:
mutex mu_;
int64 i_ GUARDED_BY(mu_);
diff --git a/tensorflow/core/kernels/transpose_op.cc b/tensorflow/core/kernels/transpose_op.cc
index 20f0edf309..96c051c636 100644
--- a/tensorflow/core/kernels/transpose_op.cc
+++ b/tensorflow/core/kernels/transpose_op.cc
@@ -31,13 +31,14 @@ limitations under the License.
namespace tensorflow {
-// inv = InvertPermutationOp(T<int32> p) takes a permutation of
+// inv = InvertPermutationOp(T<int32/int64> p) takes a permutation of
// integers 0, 1, ..., n - 1 and returns the inverted
// permutation of p. I.e., inv[p[i]] == i, for i in [0 .. n).
//
-// REQUIRES: input is a vector of int32.
+// REQUIRES: input is a vector of int32 or int64.
// REQUIRES: input is a permutation of 0, 1, ..., n-1.
+template <typename T>
class InvertPermutationOp : public OpKernel {
public:
explicit InvertPermutationOp(OpKernelConstruction* context)
@@ -48,20 +49,19 @@ class InvertPermutationOp : public OpKernel {
OP_REQUIRES(
context, TensorShapeUtils::IsVector(input.shape()),
errors::InvalidArgument("invert_permutation expects a 1D vector."));
- auto Tin = input.vec<int32>();
+ auto Tin = input.vec<T>();
OP_REQUIRES(context,
FastBoundsCheck(Tin.size(), std::numeric_limits<int32>::max()),
errors::InvalidArgument("permutation of nonnegative int32s "
"must have <= int32 max elements"));
- const int32 N =
- static_cast<int32>(Tin.size()); // Safe: bounds-checked above.
+ const T N = static_cast<T>(Tin.size()); // Safe: bounds-checked above.
Tensor* output = nullptr;
OP_REQUIRES_OK(context,
context->allocate_output(0, input.shape(), &output));
- auto Tout = output->vec<int32>();
+ auto Tout = output->vec<T>();
std::fill_n(Tout.data(), N, -1);
for (int i = 0; i < N; ++i) {
- const int32 d = internal::SubtleMustCopy(Tin(i));
+ const T d = internal::SubtleMustCopy(Tin(i));
OP_REQUIRES(context, FastBoundsCheck(d, N),
errors::InvalidArgument(d, " is not between 0 and ", N));
OP_REQUIRES(context, Tout(d) == -1,
@@ -73,14 +73,23 @@ class InvertPermutationOp : public OpKernel {
REGISTER_KERNEL_BUILDER(
Name("InvertPermutation").Device(DEVICE_CPU).TypeConstraint<int32>("T"),
- InvertPermutationOp);
+ InvertPermutationOp<int32>);
+REGISTER_KERNEL_BUILDER(
+ Name("InvertPermutation").Device(DEVICE_CPU).TypeConstraint<int64>("T"),
+ InvertPermutationOp<int64>);
REGISTER_KERNEL_BUILDER(Name("InvertPermutation")
.Device(DEVICE_GPU)
.TypeConstraint<int32>("T")
.HostMemory("x")
.HostMemory("y"),
- InvertPermutationOp);
+ InvertPermutationOp<int32>);
+REGISTER_KERNEL_BUILDER(Name("InvertPermutation")
+ .Device(DEVICE_GPU)
+ .TypeConstraint<int64>("T")
+ .HostMemory("x")
+ .HostMemory("y"),
+ InvertPermutationOp<int64>);
#ifdef TENSORFLOW_USE_SYCL
REGISTER_KERNEL_BUILDER(Name("InvertPermutation")
@@ -88,7 +97,13 @@ REGISTER_KERNEL_BUILDER(Name("InvertPermutation")
.TypeConstraint<int32>("T")
.HostMemory("x")
.HostMemory("y"),
- InvertPermutationOp);
+ InvertPermutationOp<int32>);
+REGISTER_KERNEL_BUILDER(Name("InvertPermutation")
+ .Device(DEVICE_SYCL)
+ .TypeConstraint<int64>("T")
+ .HostMemory("x")
+ .HostMemory("y"),
+ InvertPermutationOp<int64>);
#endif // TENSORFLOW_USE_SYCL
namespace {
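
The comment above defines the InvertPermutation contract, inv[p[i]] == i, now for both index types. A tiny model of it on a std::vector (illustrative only):

#include <cstddef>
#include <cstdint>
#include <vector>

// inv[p[i]] = i, with the -1 fill standing in for "not yet assigned".
template <typename T>
std::vector<T> InvertPermutation(const std::vector<T>& p) {
  std::vector<T> inv(p.size(), static_cast<T>(-1));
  for (std::size_t i = 0; i < p.size(); ++i) {
    inv[p[i]] = static_cast<T>(i);  // assumes p is a valid permutation
  }
  return inv;
}

// InvertPermutation<int64_t>({2, 0, 1}) returns {1, 2, 0}.
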
diff --git a/tensorflow/core/kernels/unique_op.cc b/tensorflow/core/kernels/unique_op.cc
index 701c5f6d2b..d087784c8a 100644
--- a/tensorflow/core/kernels/unique_op.cc
+++ b/tensorflow/core/kernels/unique_op.cc
@@ -13,6 +13,7 @@ See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
+#include <functional>
#include <unordered_map>
#include <utility>
@@ -21,6 +22,7 @@ limitations under the License.
#include "tensorflow/core/framework/tensor.h"
#include "tensorflow/core/framework/tensor_shape.h"
#include "tensorflow/core/lib/core/status.h"
+#include "tensorflow/core/lib/hash/hash.h"
namespace tensorflow {
@@ -33,8 +35,6 @@ class UniqueOp : public OpKernel {
void Compute(OpKernelContext* context) override {
const Tensor& input = context->input(0);
- OP_REQUIRES(context, TensorShapeUtils::IsVector(input.shape()),
- errors::InvalidArgument("unique expects a 1D vector."));
// TODO(dga): Make unique polymorphic for returning int32 and int64
// vectors to support large tensors.
OP_REQUIRES(context,
@@ -42,31 +42,102 @@ class UniqueOp : public OpKernel {
errors::InvalidArgument(
"unique does not support input tensors larger than ",
std::numeric_limits<int32>::max(), " elements"));
- auto Tin = input.vec<T>();
- const int64 N = static_cast<int64>(Tin.size());
+
+ int64 axis = 0;
+ std::vector<int64> new_sizes{1, input.NumElements(), 1};
+ if (context->num_inputs() == 1) {
+ OP_REQUIRES(context, TensorShapeUtils::IsVector(input.shape()),
+ errors::InvalidArgument("unique expects a 1D vector."));
+ } else {
+ // For UniqueV2, the axis input is a 1-D vector so that the caller
+ // can specify either "no axis" or a concrete axis: `[]` means
+ // "no axis", while `[x]` means `axis = x`.
+ const Tensor& axis_tensor = context->input(1);
+ OP_REQUIRES(context, TensorShapeUtils::IsVector(axis_tensor.shape()),
+ errors::InvalidArgument("axis expects a 1D vector."));
+ OP_REQUIRES(
+ context, axis_tensor.NumElements() <= 1,
+ errors::InvalidArgument(
+ "axis does not support input tensors larger than 1 elements"));
+ if (axis_tensor.NumElements() == 0) {
+ OP_REQUIRES(context, TensorShapeUtils::IsVector(input.shape()),
+ errors::InvalidArgument("unique expects a 1D vector."));
+ } else {
+ auto axis_vec = axis_tensor.vec<int64>();
+ axis = axis_vec(0);
+ axis = axis < 0 ? axis + input.dims() : axis;
+ OP_REQUIRES(context, 0 <= axis && axis < input.dims(),
+ errors::InvalidArgument("axis has to be between [0, ",
+ input.dims(), ")"));
+ if (axis > 0) {
+ for (int64 i = 0; i < axis; i++) {
+ new_sizes[0] *= input.dim_size(i);
+ }
+ }
+ new_sizes[1] = input.dim_size(axis);
+ if (axis + 1 < input.dims()) {
+ for (int64 i = axis + 1; i < input.dims(); i++) {
+ new_sizes[2] *= input.dim_size(i);
+ }
+ }
+ }
+ }
+
+ auto Tin = input.shaped<T, 3>(new_sizes);
Tensor* idx = nullptr;
- OP_REQUIRES_OK(context, context->forward_input_or_allocate_output(
- {0}, 1, input.shape(), &idx));
+ OP_REQUIRES_OK(context, context->allocate_output(
+ 1, TensorShape({Tin.dimension(1)}), &idx));
auto idx_vec = idx->template vec<TIndex>();
- std::unordered_map<T, TIndex> uniq;
- uniq.reserve(2 * N);
- for (int64 i = 0, j = 0; i < N; ++i) {
- auto it = uniq.insert(std::make_pair(Tin(i), j));
+ auto hash_fn = [&Tin](const int64& key) -> size_t {
+ size_t h = 0;
+ for (int64 i = 0; i < Tin.dimension(0); i++) {
+ for (int64 j = 0; j < Tin.dimension(2); j++) {
+ h = Hash64Combine(h, hash<T>{}(Tin(i, key, j)));
+ }
+ }
+ return h;
+ };
+
+ auto equal_to_fn = [&Tin](const int64& lhs, const int64& rhs) {
+ for (int64 i = 0; i < Tin.dimension(0); i++) {
+ for (int64 j = 0; j < Tin.dimension(2); j++) {
+ if (Tin(i, lhs, j) != Tin(i, rhs, j)) {
+ return false;
+ }
+ }
+ }
+ return true;
+ };
+
+ std::unordered_map<int64, int64, decltype(hash_fn), decltype(equal_to_fn)>
+ uniq(0, hash_fn, equal_to_fn);
+
+ uniq.reserve(2 * Tin.dimension(1));
+
+ for (int64 i = 0, j = 0; i < Tin.dimension(1); ++i) {
+ auto it = uniq.insert(std::make_pair(i, j));
idx_vec(i) = it.first->second;
if (it.second) {
++j;
}
}
+
int64 uniq_size = static_cast<int64>(uniq.size());
+ new_sizes[1] = uniq_size;
+ TensorShape output_shape(input.shape());
+ output_shape.set_dim(axis, uniq_size);
Tensor* output = nullptr;
- OP_REQUIRES_OK(context, context->allocate_output(
- 0, TensorShape({uniq_size}), &output));
- auto output_vec = output->template vec<T>();
+ OP_REQUIRES_OK(context, context->allocate_output(0, output_shape, &output));
+ auto Tout = output->shaped<T, 3>(new_sizes);
for (auto it : uniq) {
- output_vec(it.second) = it.first;
+ for (int64 i = 0; i < Tin.dimension(0); i++) {
+ for (int64 j = 0; j < Tin.dimension(2); j++) {
+ Tout(i, it.second, j) = Tin(i, it.first, j);
+ }
+ }
}
if (num_outputs() > 2) {
@@ -74,7 +145,7 @@ class UniqueOp : public OpKernel {
2, TensorShape({uniq_size}), &output));
auto count_output_vec = output->template vec<TIndex>();
count_output_vec.setZero();
- for (int64 i = 0; i < N; ++i) {
+ for (int64 i = 0; i < Tin.dimension(1); ++i) {
count_output_vec(idx_vec(i))++;
}
}
@@ -92,6 +163,16 @@ class UniqueOp : public OpKernel {
.TypeConstraint<type>("T") \
.TypeConstraint<int64>("out_idx"), \
UniqueOp<type, int64>); \
+ REGISTER_KERNEL_BUILDER(Name("UniqueV2") \
+ .Device(DEVICE_CPU) \
+ .TypeConstraint<type>("T") \
+ .TypeConstraint<int32>("out_idx"), \
+ UniqueOp<type, int32>); \
+ REGISTER_KERNEL_BUILDER(Name("UniqueV2") \
+ .Device(DEVICE_CPU) \
+ .TypeConstraint<type>("T") \
+ .TypeConstraint<int64>("out_idx"), \
+ UniqueOp<type, int64>); \
REGISTER_KERNEL_BUILDER(Name("UniqueWithCounts") \
.Device(DEVICE_CPU) \
.TypeConstraint<type>("T") \
@@ -176,5 +257,5 @@ REGISTER_KERNEL_BUILDER(Name("Unique")
.HostMemory("y")
.HostMemory("idx"),
UniqueOp<int64, int64>);
-#endif // TENSORFLOW_USE_SYCL
+#endif // TENSORFLOW_USE_SYCL
} // namespace tensorflow
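
UniqueV2 above reshapes the input to a rank-3 view around `axis` and deduplicates middle-axis slices through a hash map whose hash and equality walk entire slices. A simplified sketch of that bookkeeping for 2-D columns, using std::hash in place of Hash64Combine:

#include <cstddef>
#include <cstdint>
#include <functional>
#include <unordered_map>
#include <vector>

// Keys are column indices; the map's hash and equality visit the whole
// column so duplicate slices collapse to one output index.
std::vector<int64_t> UniqueColumns(
    const std::vector<std::vector<int64_t>>& cols) {
  auto hash_fn = [&cols](int64_t key) {
    std::size_t h = 0;
    for (int64_t v : cols[key]) {
      h = h * 31 + std::hash<int64_t>{}(v);  // stand-in for Hash64Combine
    }
    return h;
  };
  auto eq_fn = [&cols](int64_t a, int64_t b) { return cols[a] == cols[b]; };
  std::unordered_map<int64_t, int64_t, decltype(hash_fn), decltype(eq_fn)>
      uniq(0, hash_fn, eq_fn);
  std::vector<int64_t> idx(cols.size());
  int64_t next = 0;
  for (int64_t i = 0; i < static_cast<int64_t>(cols.size()); ++i) {
    auto it = uniq.insert({i, next});
    idx[i] = it.first->second;
    if (it.second) ++next;
  }
  return idx;  // idx[i] is the position of column i in the unique output
}
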
diff --git a/tensorflow/core/kernels/zip_dataset_op.cc b/tensorflow/core/kernels/zip_dataset_op.cc
index f466c8b268..a80b9edbe4 100644
--- a/tensorflow/core/kernels/zip_dataset_op.cc
+++ b/tensorflow/core/kernels/zip_dataset_op.cc
@@ -35,15 +35,14 @@ class ZipDatasetOp : public DatasetOpKernel {
OP_REQUIRES_OK(ctx, GetDatasetFromVariantTensor(ctx->input(i), &input));
inputs.push_back(input);
}
- *output = new Dataset(ctx, inputs);
+ *output = new Dataset(inputs);
}
private:
- class Dataset : public GraphDatasetBase {
+ class Dataset : public DatasetBase {
public:
- explicit Dataset(OpKernelContext* ctx,
- const std::vector<DatasetBase*>& inputs)
- : GraphDatasetBase(ctx), inputs_(inputs) {
+ explicit Dataset(const std::vector<DatasetBase*>& inputs)
+ : inputs_(inputs) {
for (const auto& input : inputs_) {
input->Ref();
for (DataType dt : input->output_dtypes()) {
@@ -77,21 +76,6 @@ class ZipDatasetOp : public DatasetOpKernel {
string DebugString() override { return "ZipDatasetOp::Dataset"; }
- protected:
- Status AsGraphDefInternal(DatasetGraphDefBuilder* b,
- Node** output) const override {
- std::vector<NodeBuilder::NodeOut> input_graph_nodes;
- input_graph_nodes.reserve(inputs_.size());
- for (const auto& input : inputs_) {
- Node* input_node;
- TF_RETURN_IF_ERROR(b->AddParentDataset(input, &input_node));
- input_graph_nodes.emplace_back(input_node);
- }
- TF_RETURN_IF_ERROR(
- b->AddDatasetWithInputAsList(this, input_graph_nodes, output));
- return Status::OK();
- }
-
private:
class Iterator : public DatasetIterator<Dataset> {
public:
@@ -109,10 +93,6 @@ class ZipDatasetOp : public DatasetOpKernel {
std::vector<Tensor>* out_tensors,
bool* end_of_sequence) override {
mutex_lock l(mu_);
- if (input_impls_.empty()) {
- *end_of_sequence = true;
- return Status::OK();
- }
out_tensors->clear();
out_tensors->reserve(dataset()->output_dtypes().size());
for (const auto& input_impl : input_impls_) {
@@ -120,43 +100,12 @@ class ZipDatasetOp : public DatasetOpKernel {
TF_RETURN_IF_ERROR(
input_impl->GetNext(ctx, &input_tensors, end_of_sequence));
if (*end_of_sequence) {
- break;
+ return Status::OK();
}
out_tensors->insert(out_tensors->end(), input_tensors.begin(),
input_tensors.end());
}
- if (*end_of_sequence) {
- out_tensors->clear();
- input_impls_.clear();
- } else {
- *end_of_sequence = false;
- }
- return Status::OK();
- }
-
- protected:
- Status SaveInternal(IteratorStateWriter* writer) override {
- mutex_lock l(mu_);
- if (input_impls_.empty()) {
- TF_RETURN_IF_ERROR(
- writer->WriteScalar(full_name("input_impls_empty"), ""));
- } else {
- for (auto& input_impl : input_impls_)
- TF_RETURN_IF_ERROR(SaveParent(writer, input_impl));
- }
- return Status::OK();
- }
-
- Status RestoreInternal(OpKernelContext* ctx,
- IteratorStateReader* reader) override {
- mutex_lock l(mu_);
- if (reader->Contains(full_name("input_impls_empty"))) {
- input_impls_.clear();
- } else {
- DCHECK_EQ(input_impls_.size(), dataset()->inputs_.size());
- for (auto& input_impl : input_impls_)
- TF_RETURN_IF_ERROR(RestoreParent(ctx, reader, input_impl));
- }
+ *end_of_sequence = false;
return Status::OK();
}
diff --git a/tensorflow/core/ops/array_ops.cc b/tensorflow/core/ops/array_ops.cc
index cdf370399c..c8cc147360 100644
--- a/tensorflow/core/ops/array_ops.cc
+++ b/tensorflow/core/ops/array_ops.cc
@@ -723,7 +723,9 @@ y: a tensor of the same shape and type as x but filled with zeros.
REGISTER_OP("OnesLike")
.Input("x: T")
.Output("y: T")
- .Attr("T: {float, double, int32, int64, complex64, complex128}")
+ .Attr(
+ "T: {float, double, int8, uint8, int16, uint16, int32, int64, "
+ "complex64, complex128, bool}")
.SetShapeFn(shape_inference::UnchangedShape)
.Doc(R"doc(
Returns a tensor of ones with the same shape and type as x.
@@ -2031,6 +2033,46 @@ y: 1-D.
idx: 1-D.
)doc");
+REGISTER_OP("UniqueV2")
+ .Input("x: T")
+ .Input("axis: int64")
+ .Output("y: T")
+ .Output("idx: out_idx")
+ .Attr("T: type")
+ .Attr("out_idx: {int32, int64} = DT_INT32")
+ .SetShapeFn([](InferenceContext* c) {
+ c->set_output(0, c->Vector(InferenceContext::kUnknownDim));
+ c->set_output(1, c->input(0));
+ return Status::OK();
+ })
+ .Doc(R"doc(
+Finds unique elements along an axis of a tensor.
+
+This operation returns a tensor `y` containing all of the unique elements of `x`
+sorted in the same order that they occur in `x`. This operation also returns a
+tensor `idx` the same size as `x` that contains the index of each value of `x`
+in the unique output `y`. In other words:
+
+`y[idx[i]] = x[i] for i in [0, 1, ..., size(x) - 1]`
+
+For example:
+
+```
+# tensor 'x' is [1, 1, 2, 4, 4, 4, 7, 8, 8]
+y, idx = unique(x)
+y ==> [1, 2, 4, 7, 8]
+idx ==> [0, 0, 1, 2, 2, 2, 3, 4, 4]
+```
+
+
+x: A `Tensor`.
+axis: A `Tensor` of type `int64` (default: 0). The axis of the Tensor along
+  which to find the unique elements.
+y: A `Tensor`. Unique elements along the `axis` of `Tensor` x.
+idx: A 1-D Tensor of type `out_idx`, the same size as `x` along the given
+  axis, that contains the index of each value of x in the output y.
+)doc");
+
// --------------------------------------------------------------------------
REGISTER_OP("UniqueWithCounts")
.Input("x: T")
diff --git a/tensorflow/core/ops/compat/ops_history.v1.pbtxt b/tensorflow/core/ops/compat/ops_history.v1.pbtxt
index 60f67543f1..8b8251f84b 100644
--- a/tensorflow/core/ops/compat/ops_history.v1.pbtxt
+++ b/tensorflow/core/ops/compat/ops_history.v1.pbtxt
@@ -8271,29 +8271,6 @@ op {
}
}
op {
- name: "DatasetToSingleElement"
- input_arg {
- name: "dataset"
- type: DT_VARIANT
- }
- output_arg {
- name: "components"
- type_list_attr: "output_types"
- }
- attr {
- name: "output_types"
- type: "list(type)"
- has_minimum: true
- minimum: 1
- }
- attr {
- name: "output_shapes"
- type: "list(shape)"
- has_minimum: true
- minimum: 1
- }
-}
-op {
name: "DebugGradientIdentity"
input_arg {
name: "input"
@@ -9272,69 +9249,6 @@ op {
}
}
op {
- name: "DenseToSparseBatchDataset"
- input_arg {
- name: "input_dataset"
- type: DT_VARIANT
- }
- input_arg {
- name: "batch_size"
- type: DT_INT64
- }
- input_arg {
- name: "row_shape"
- type: DT_INT64
- }
- output_arg {
- name: "handle"
- type: DT_VARIANT
- }
- attr {
- name: "output_types"
- type: "list(type)"
- has_minimum: true
- minimum: 1
- }
- attr {
- name: "output_shapes"
- type: "list(shape)"
- has_minimum: true
- minimum: 1
- }
- is_stateful: true
-}
-op {
- name: "DenseToSparseBatchDataset"
- input_arg {
- name: "input_dataset"
- type: DT_VARIANT
- }
- input_arg {
- name: "batch_size"
- type: DT_INT64
- }
- input_arg {
- name: "row_shape"
- type: DT_INT64
- }
- output_arg {
- name: "handle"
- type: DT_VARIANT
- }
- attr {
- name: "output_types"
- type: "list(type)"
- has_minimum: true
- minimum: 1
- }
- attr {
- name: "output_shapes"
- type: "list(shape)"
- has_minimum: true
- minimum: 1
- }
-}
-op {
name: "DenseToSparseSetOperation"
input_arg {
name: "set1"
@@ -9828,18 +9742,6 @@ op {
}
}
op {
- name: "DeserializeIterator"
- input_arg {
- name: "resource_handle"
- type: DT_RESOURCE
- }
- input_arg {
- name: "serialized"
- type: DT_VARIANT
- }
- is_stateful: true
-}
-op {
name: "DeserializeManySparse"
input_arg {
name: "serialized_sparse"
@@ -13593,131 +13495,6 @@ op {
}
}
op {
- name: "GroupByWindowDataset"
- input_arg {
- name: "input_dataset"
- type: DT_VARIANT
- }
- input_arg {
- name: "key_func_other_arguments"
- type_list_attr: "Tkey_func_other_arguments"
- }
- input_arg {
- name: "reduce_func_other_arguments"
- type_list_attr: "Treduce_func_other_arguments"
- }
- input_arg {
- name: "window_size_func_other_arguments"
- type_list_attr: "Twindow_size_func_other_arguments"
- }
- output_arg {
- name: "handle"
- type: DT_VARIANT
- }
- attr {
- name: "key_func"
- type: "func"
- }
- attr {
- name: "reduce_func"
- type: "func"
- }
- attr {
- name: "window_size_func"
- type: "func"
- }
- attr {
- name: "Tkey_func_other_arguments"
- type: "list(type)"
- has_minimum: true
- }
- attr {
- name: "Treduce_func_other_arguments"
- type: "list(type)"
- has_minimum: true
- }
- attr {
- name: "Twindow_size_func_other_arguments"
- type: "list(type)"
- has_minimum: true
- }
- attr {
- name: "output_types"
- type: "list(type)"
- has_minimum: true
- minimum: 1
- }
- attr {
- name: "output_shapes"
- type: "list(shape)"
- has_minimum: true
- minimum: 1
- }
- is_stateful: true
-}
-op {
- name: "GroupByWindowDataset"
- input_arg {
- name: "input_dataset"
- type: DT_VARIANT
- }
- input_arg {
- name: "key_func_other_arguments"
- type_list_attr: "Tkey_func_other_arguments"
- }
- input_arg {
- name: "reduce_func_other_arguments"
- type_list_attr: "Treduce_func_other_arguments"
- }
- input_arg {
- name: "window_size_func_other_arguments"
- type_list_attr: "Twindow_size_func_other_arguments"
- }
- output_arg {
- name: "handle"
- type: DT_VARIANT
- }
- attr {
- name: "key_func"
- type: "func"
- }
- attr {
- name: "reduce_func"
- type: "func"
- }
- attr {
- name: "window_size_func"
- type: "func"
- }
- attr {
- name: "Tkey_func_other_arguments"
- type: "list(type)"
- has_minimum: true
- }
- attr {
- name: "Treduce_func_other_arguments"
- type: "list(type)"
- has_minimum: true
- }
- attr {
- name: "Twindow_size_func_other_arguments"
- type: "list(type)"
- has_minimum: true
- }
- attr {
- name: "output_types"
- type: "list(type)"
- has_minimum: true
- minimum: 1
- }
- attr {
- name: "output_shapes"
- type: "list(shape)"
- has_minimum: true
- minimum: 1
- }
-}
-op {
name: "HSVToRGB"
input_arg {
name: "images"
@@ -14138,53 +13915,6 @@ op {
}
}
op {
- name: "IgnoreErrorsDataset"
- input_arg {
- name: "input_dataset"
- type: DT_VARIANT
- }
- output_arg {
- name: "handle"
- type: DT_VARIANT
- }
- attr {
- name: "output_types"
- type: "list(type)"
- has_minimum: true
- minimum: 1
- }
- attr {
- name: "output_shapes"
- type: "list(shape)"
- has_minimum: true
- minimum: 1
- }
- is_stateful: true
-}
-op {
- name: "IgnoreErrorsDataset"
- input_arg {
- name: "input_dataset"
- type: DT_VARIANT
- }
- output_arg {
- name: "handle"
- type: DT_VARIANT
- }
- attr {
- name: "output_types"
- type: "list(type)"
- has_minimum: true
- minimum: 1
- }
- attr {
- name: "output_shapes"
- type: "list(shape)"
- has_minimum: true
- minimum: 1
- }
-}
-op {
name: "Imag"
input_arg {
name: "input"
@@ -16089,50 +15819,6 @@ op {
is_stateful: true
}
op {
- name: "MapAndBatchDataset"
- input_arg {
- name: "input_dataset"
- type: DT_VARIANT
- }
- input_arg {
- name: "other_arguments"
- type_list_attr: "Targuments"
- }
- input_arg {
- name: "batch_size"
- type: DT_INT64
- }
- input_arg {
- name: "num_parallel_batches"
- type: DT_INT64
- }
- output_arg {
- name: "handle"
- type: DT_VARIANT
- }
- attr {
- name: "f"
- type: "func"
- }
- attr {
- name: "Targuments"
- type: "list(type)"
- has_minimum: true
- }
- attr {
- name: "output_types"
- type: "list(type)"
- has_minimum: true
- minimum: 1
- }
- attr {
- name: "output_shapes"
- type: "list(shape)"
- has_minimum: true
- minimum: 1
- }
-}
-op {
name: "MapClear"
attr {
name: "capacity"
@@ -20871,54 +20557,6 @@ op {
}
}
op {
- name: "ParallelInterleaveDataset"
- input_arg {
- name: "input_dataset"
- type: DT_VARIANT
- }
- input_arg {
- name: "other_arguments"
- type_list_attr: "Targuments"
- }
- input_arg {
- name: "cycle_length"
- type: DT_INT64
- }
- input_arg {
- name: "block_length"
- type: DT_INT64
- }
- input_arg {
- name: "sloppy"
- type: DT_BOOL
- }
- output_arg {
- name: "handle"
- type: DT_VARIANT
- }
- attr {
- name: "f"
- type: "func"
- }
- attr {
- name: "Targuments"
- type: "list(type)"
- has_minimum: true
- }
- attr {
- name: "output_types"
- type: "list(type)"
- has_minimum: true
- minimum: 1
- }
- attr {
- name: "output_shapes"
- type: "list(shape)"
- has_minimum: true
- minimum: 1
- }
-}
-op {
name: "ParallelMapDataset"
input_arg {
name: "input_dataset"
@@ -21671,52 +21309,6 @@ op {
is_stateful: true
}
op {
- name: "Print"
- input_arg {
- name: "input"
- type_attr: "T"
- }
- input_arg {
- name: "data"
- type_list_attr: "U"
- }
- output_arg {
- name: "output"
- type_attr: "T"
- }
- attr {
- name: "T"
- type: "type"
- }
- attr {
- name: "U"
- type: "list(type)"
- has_minimum: true
- }
- attr {
- name: "message"
- type: "string"
- default_value {
- s: ""
- }
- }
- attr {
- name: "first_n"
- type: "int"
- default_value {
- i: -1
- }
- }
- attr {
- name: "summarize"
- type: "int"
- default_value {
- i: 3
- }
- }
- is_stateful: true
-}
-op {
name: "PriorityQueue"
output_arg {
name: "handle"
@@ -30555,52 +30147,6 @@ op {
}
}
op {
- name: "ScanDataset"
- input_arg {
- name: "input_dataset"
- type: DT_VARIANT
- }
- input_arg {
- name: "initial_state"
- type_list_attr: "Tstate"
- }
- input_arg {
- name: "other_arguments"
- type_list_attr: "Targuments"
- }
- output_arg {
- name: "handle"
- type: DT_VARIANT
- }
- attr {
- name: "f"
- type: "func"
- }
- attr {
- name: "Tstate"
- type: "list(type)"
- has_minimum: true
- minimum: 1
- }
- attr {
- name: "Targuments"
- type: "list(type)"
- has_minimum: true
- }
- attr {
- name: "output_types"
- type: "list(type)"
- has_minimum: true
- minimum: 1
- }
- attr {
- name: "output_shapes"
- type: "list(shape)"
- has_minimum: true
- minimum: 1
- }
-}
-op {
name: "ScatterAdd"
input_arg {
name: "ref"
@@ -32316,18 +31862,6 @@ op {
}
}
op {
- name: "SerializeIterator"
- input_arg {
- name: "resource_handle"
- type: DT_RESOURCE
- }
- output_arg {
- name: "serialized"
- type: DT_VARIANT
- }
- is_stateful: true
-}
-op {
name: "SerializeManySparse"
input_arg {
name: "sparse_indices"
@@ -37732,38 +37266,6 @@ op {
}
}
op {
- name: "SqlDataset"
- input_arg {
- name: "driver_name"
- type: DT_STRING
- }
- input_arg {
- name: "data_source_name"
- type: DT_STRING
- }
- input_arg {
- name: "query"
- type: DT_STRING
- }
- output_arg {
- name: "handle"
- type: DT_VARIANT
- }
- attr {
- name: "output_types"
- type: "list(type)"
- has_minimum: true
- minimum: 1
- }
- attr {
- name: "output_shapes"
- type: "list(shape)"
- has_minimum: true
- minimum: 1
- }
- is_stateful: true
-}
-op {
name: "Sqrt"
input_arg {
name: "x"
diff --git a/tensorflow/core/ops/dataset_ops.cc b/tensorflow/core/ops/dataset_ops.cc
index f512213964..8f5d8308a3 100644
--- a/tensorflow/core/ops/dataset_ops.cc
+++ b/tensorflow/core/ops/dataset_ops.cc
@@ -141,16 +141,6 @@ count: A scalar representing the number of elements from the `input_dataset`
that should be skipped. If count is -1, skips everything.
)doc");
-REGISTER_OP("IgnoreErrorsDataset")
- .Input("input_dataset: variant")
- .Output("handle: variant")
- .Attr("output_types: list(type) >= 1")
- .Attr("output_shapes: list(shape) >= 1")
- .SetShapeFn(shape_inference::ScalarShape)
- .Doc(R"doc(
-Creates a dataset that contains the elements of `input_dataset` ignoring errors.
-)doc");
-
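For context, the semantics of the op removed above are easy to state in plain Python: pass elements through, dropping any whose production raises. A minimal sketch of the behavior, not the kernel's actual implementation:

```python
def ignore_errors(source):
    """Yield elements from `source`, silently dropping ones that raise."""
    it = iter(source)
    while True:
        try:
            yield next(it)
        except StopIteration:
            return
        except Exception:
            continue

# map() iterators survive exceptions raised by the mapped function:
src = map(lambda x: 10 // x, [1, 0, 2])
print(list(ignore_errors(src)))  # [10, 5]
```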
REGISTER_OP("MapDataset")
.Input("input_dataset: variant")
.Input("other_arguments: Targuments")
@@ -184,32 +174,6 @@ num_parallel_calls: The number of concurrent invocations of `f` that process
elements from `input_dataset` in parallel.
)doc");
-REGISTER_OP("MapAndBatchDataset")
- .Input("input_dataset: variant")
- .Input("other_arguments: Targuments")
- .Input("batch_size: int64")
- .Input("num_parallel_batches: int64")
- .Output("handle: variant")
- .Attr("f: func")
- .Attr("Targuments: list(type) >= 0")
- .Attr("output_types: list(type) >= 1")
- .Attr("output_shapes: list(shape) >= 1")
- .SetShapeFn(shape_inference::ScalarShape)
- .Doc(R"doc(
-Creates a dataset that applies `f` to the outputs of `input_dataset` and then
-batches `batch_size` of them.
-
-Unlike a "MapDataset", which applies `f` sequentially, this dataset invokes up
-to `batch_size * num_parallel_batches` copies of `f` in parallel.
-
-batch_size: A scalar representing the number of elements to accumulate in a
- batch. It determines the number of concurrent invocations of `f` that process
- elements from `input_dataset` in parallel.
-num_parallel_batches: A scalar representing the number of batches to create in
- parallel. Processing multiple batches in parallel benefits workloads prone to
- stragglers.
-)doc");
-
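The fusion removed above is semantically `map(f)` followed by `batch(batch_size)`; the fused kernel's value was running up to `batch_size * num_parallel_batches` copies of `f` concurrently. A pure-Python sketch of the sequential semantics (`map_and_batch` is an illustrative helper, not the kernel's API):

```python
def map_and_batch(elements, f, batch_size):
    batch = []
    for x in elements:
        batch.append(f(x))
        if len(batch) == batch_size:
            yield batch
            batch = []
    if batch:  # emit the final, possibly short, batch
        yield batch

print(list(map_and_batch(range(5), lambda x: x * x, batch_size=2)))
# [[0, 1], [4, 9], [16]]
```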
REGISTER_OP("PrefetchDataset")
.Input("input_dataset: variant")
.Input("buffer_size: int64")
@@ -224,21 +188,6 @@ buffer_size: The maximum number of elements to buffer in an iterator over
this dataset.
)doc");
-REGISTER_OP("ScanDataset")
- .Input("input_dataset: variant")
- .Input("initial_state: Tstate")
- .Input("other_arguments: Targuments")
- .Output("handle: variant")
- .Attr("f: func")
- .Attr("Tstate: list(type) >= 1")
- .Attr("Targuments: list(type) >= 0")
- .Attr("output_types: list(type) >= 1")
- .Attr("output_shapes: list(shape) >= 1")
- .SetShapeFn(shape_inference::ScalarShape)
- .Doc(R"doc(
-Creates a dataset that successively reduces `f` over the elements of `input_dataset`.
-)doc");
-
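The op removed above is a stateful scan: like `functools.reduce`, but emitting every intermediate result as a dataset element. A pure-Python sketch of the semantics:

```python
def scan(elements, initial_state, f):
    """f maps (state, element) -> (new_state, output)."""
    state = initial_state
    for x in elements:
        state, output = f(state, x)
        yield output

# Running sum over [1, 2, 3, 4]:
print(list(scan([1, 2, 3, 4], 0, lambda s, x: (s + x, s + x))))  # [1, 3, 6, 10]
```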
REGISTER_OP("FlatMapDataset")
.Input("input_dataset: variant")
.Input("other_arguments: Targuments")
@@ -285,59 +234,6 @@ f: A function mapping elements of `input_dataset`, concatenated with
`output_types` and `output_shapes`.
)doc");
-REGISTER_OP("ParallelInterleaveDataset")
- .Input("input_dataset: variant")
- .Input("other_arguments: Targuments")
- .Input("cycle_length: int64")
- .Input("block_length: int64")
- .Input("sloppy: bool")
- .Output("handle: variant")
- .Attr("f: func")
- .Attr("Targuments: list(type) >= 0")
- .Attr("output_types: list(type) >= 1")
- .Attr("output_shapes: list(shape) >= 1")
- .SetShapeFn(shape_inference::ScalarShape)
- .Doc(R"doc(
-Creates a dataset that applies `f` to the outputs of `input_dataset`.
-
-The resulting dataset is similar to the `InterleaveDataset`, with the exception
-that if retrieving the next value from a dataset would cause the requester to
-block, it will skip that input dataset. This dataset is especially useful
-when loading data from variable-latency datastores (e.g. HDFS, GCS), as it
-allows the training step to proceed so long as some data is available.
-
-!! WARNING !! This dataset is not deterministic!
-
-f: A function mapping elements of `input_dataset`, concatenated with
- `other_arguments`, to a Dataset variant that contains elements matching
- `output_types` and `output_shapes`.
-)doc");
-
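The deterministic pattern this op parallelizes is plain interleave: cycle over `cycle_length` input iterators, taking `block_length` items from each in turn; the removed op additionally skips any input that would block when `sloppy` is set. A pure-Python sketch of the deterministic core, ignoring the real op's refilling of exhausted cycle slots from the input dataset:

```python
def interleave(sources, cycle_length, block_length):
    iters = [iter(s) for s in sources[:cycle_length]]
    while iters:
        for it in list(iters):
            for _ in range(block_length):
                try:
                    yield next(it)
                except StopIteration:
                    iters.remove(it)
                    break

print(list(interleave([[1, 1, 1], [2, 2], [3]], cycle_length=3, block_length=1)))
# [1, 2, 3, 1, 2, 1]
```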
-REGISTER_OP("GroupByWindowDataset")
- .Input("input_dataset: variant")
- .Input("key_func_other_arguments: Tkey_func_other_arguments")
- .Input("reduce_func_other_arguments: Treduce_func_other_arguments")
- .Input(
- "window_size_func_other_arguments: Twindow_size_func_other_arguments")
- .Output("handle: variant")
- .Attr("key_func: func")
- .Attr("reduce_func: func")
- .Attr("window_size_func: func")
- .Attr("Tkey_func_other_arguments: list(type) >= 0")
- .Attr("Treduce_func_other_arguments: list(type) >= 0")
- .Attr("Twindow_size_func_other_arguments: list(type) >= 0")
- .Attr("output_types: list(type) >= 1")
- .Attr("output_shapes: list(shape) >= 1")
- .SetShapeFn(shape_inference::ScalarShape)
- .Doc(R"doc(
-Creates a dataset that computes a windowed group-by on `input_dataset`.
-
-// TODO(mrry): Support non-int64 keys.
-
-key_func: A function mapping an element of `input_dataset`, concatenated
- with `key_func_other_arguments`, to a scalar value of type DT_INT64.
-)doc");
-
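A pure-Python sketch of the windowed group-by removed above: buffer elements per key and apply `reduce_func` whenever a key's buffer reaches its window size (flushing partial windows at end of input is assumed here):

```python
from collections import defaultdict

def group_by_window(elements, key_func, reduce_func, window_size_func):
    windows = defaultdict(list)
    for x in elements:
        k = key_func(x)
        windows[k].append(x)
        if len(windows[k]) == window_size_func(k):
            yield reduce_func(k, windows.pop(k))
    for k, rest in windows.items():  # flush partial windows
        yield reduce_func(k, rest)

# Group integers by parity into windows of two:
print(list(group_by_window(range(6), lambda x: x % 2,
                           lambda k, xs: (k, xs), lambda k: 2)))
# [(0, [0, 2]), (1, [1, 3]), (0, [4]), (1, [5])]
```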
REGISTER_OP("FilterDataset")
.Input("input_dataset: variant")
.Input("other_arguments: Targuments")
@@ -408,27 +304,6 @@ padding_values: A list of scalars containing the padding value to use for
each of the outputs.
)doc");
-REGISTER_OP("DenseToSparseBatchDataset")
- .Input("input_dataset: variant")
- .Input("batch_size: int64")
- .Input("row_shape: int64")
- .Output("handle: variant")
- // NOTE(mrry): the 0th and 2nd elements will be DT_INT64.
- .Attr("output_types: list(type) >= 1")
- // NOTE(mrry): the 1st and 2nd elements will be vectors.
- .Attr("output_shapes: list(shape) >= 1")
- .SetShapeFn(shape_inference::ScalarShape)
- .Doc(R"doc(
-Creates a dataset that yields a SparseTensor for each element of the input.
-
-input_dataset: A handle to an input dataset. Must have a single component.
-batch_size: A scalar representing the number of elements to accumulate in a
- batch.
-row_shape: A vector representing the dense shape of each row in the produced
- SparseTensor. The shape may be partially specified, using `-1` to indicate
- that a particular dimension should use the maximum size of all batch elements.
-)doc");
-
REGISTER_OP("RangeDataset")
.Input("start: int64")
.Input("stop: int64")
@@ -514,24 +389,6 @@ compression_type: A scalar containing either (i) the empty string (no
buffer_size: A scalar containing the number of bytes to buffer.
)doc");
-REGISTER_OP("SqlDataset")
- .Input("driver_name: string")
- .Input("data_source_name: string")
- .Input("query: string")
- .Output("handle: variant")
- .Attr("output_types: list(type) >= 1")
- .Attr("output_shapes: list(shape) >= 1")
- .SetIsStateful() // TODO(b/65524810): Source dataset ops must be marked
- // stateful to inhibit constant folding.
- .SetShapeFn(shape_inference::ScalarShape)
- .Doc(R"doc(
-Creates a dataset that executes a SQL query and emits rows of the result set.
-
-driver_name: The database type. Currently, the only supported type is 'sqlite'.
-data_source_name: A connection string to connect to the database.
-query: A SQL query to execute.
-)doc");
-
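The removed kernel's behavior maps directly onto the standard-library `sqlite3` module (sqlite being the only supported driver): connect, execute the query, and emit each result row as an element. A hedged sketch; `sql_rows` is an illustrative helper, not the kernel's API:

```python
import sqlite3

def sql_rows(data_source_name, query):
    """Yield result rows of `query` against a sqlite database."""
    conn = sqlite3.connect(data_source_name)
    try:
        for row in conn.execute(query):
            yield row
    finally:
        conn.close()
```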
REGISTER_OP("FixedLengthRecordDataset")
.Input("filenames: string")
.Input("header_bytes: int64")
@@ -662,36 +519,6 @@ REGISTER_OP("IteratorGetNext")
Gets the next output from the given iterator.
)doc");
-REGISTER_OP("DatasetToSingleElement")
- .Input("dataset: variant")
- .Output("components: output_types")
- .Attr("output_types: list(type) >= 1")
- .Attr("output_shapes: list(shape) >= 1")
- .SetShapeFn([](shape_inference::InferenceContext* c) {
- shape_inference::ShapeHandle unused;
- TF_RETURN_IF_ERROR(c->WithRank(c->input(0), 0, &unused));
- std::vector<PartialTensorShape> output_shapes;
- TF_RETURN_IF_ERROR(c->GetAttr("output_shapes", &output_shapes));
- if (output_shapes.size() != c->num_outputs()) {
- return errors::InvalidArgument(
- "`output_shapes` must be the same length as `output_types` (",
- output_shapes.size(), " vs. ", c->num_outputs(), ")");
- }
- for (size_t i = 0; i < output_shapes.size(); ++i) {
- shape_inference::ShapeHandle output_shape_handle;
- TF_RETURN_IF_ERROR(c->MakeShapeFromPartialTensorShape(
- output_shapes[i], &output_shape_handle));
- c->set_output(static_cast<int>(i), output_shape_handle);
- }
- return Status::OK();
- })
- .Doc(R"doc(
-Outputs the single element from the given dataset.
-
-dataset: A handle to a dataset that contains a single element.
-components: The components of the single element of `input`.
-)doc");
-
REGISTER_OP("IteratorToStringHandle")
.Input("resource_handle: resource")
.Output("string_handle: string")
@@ -720,28 +547,4 @@ output_shapes: If specified, defines the shape of each tuple component in an
element produced by the resulting iterator.
)doc");
-REGISTER_OP("SerializeIterator")
- .Input("resource_handle: resource")
- .Output("serialized: variant")
- .SetShapeFn(shape_inference::ScalarShape)
- .Doc(R"doc(
-Converts the given `resource_handle` representing an iterator to a variant tensor.
-
-resource_handle: A handle to an iterator resource.
-serialized: A variant tensor storing the state of the iterator contained in the
- resource.
-)doc");
-
-REGISTER_OP("DeserializeIterator")
- .Input("resource_handle: resource")
- .Input("serialized: variant")
- .SetShapeFn(shape_inference::NoOutputs)
- .Doc(R"doc(
-Converts the given variant tensor to an iterator and stores it in the given resource.
-
-resource_handle: A handle to an iterator resource.
-serialized: A variant tensor storing the state of the iterator contained in the
- resource.
-)doc");
-
} // namespace tensorflow
diff --git a/tensorflow/core/ops/logging_ops.cc b/tensorflow/core/ops/logging_ops.cc
index e6995821df..11cb9861a3 100644
--- a/tensorflow/core/ops/logging_ops.cc
+++ b/tensorflow/core/ops/logging_ops.cc
@@ -43,7 +43,7 @@ REGISTER_OP("Print")
.Output("output: T")
.SetIsStateful()
.Attr("T: type")
- .Attr("U: list(type) >= 0")
+ .Attr("U: list(type)")
.Attr("message: string = ''")
.Attr("first_n: int = -1")
.Attr("summarize: int = 3")
diff --git a/tensorflow/core/ops/math_ops.cc b/tensorflow/core/ops/math_ops.cc
index 7b10af9f44..d30b847696 100644
--- a/tensorflow/core/ops/math_ops.cc
+++ b/tensorflow/core/ops/math_ops.cc
@@ -1829,6 +1829,8 @@ need not be sorted and need not cover all values in the full
range of valid values.
If the sum is empty for a given segment ID `i`, `output[i] = 0`.
+If the given segment ID `i` is negative, the value is dropped and will not be
+added to the sum of the segment.
`num_segments` should equal the number of distinct segment IDs.
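The added sentence documents that a negative segment ID acts as a drop marker in the unsorted segment reduction. For example, via the Python wrapper `tf.unsorted_segment_sum`:

```python
import tensorflow as tf

data = tf.constant([1.0, 2.0, 3.0, 4.0])
segment_ids = tf.constant([0, -1, 1, 1])  # the -1 entry contributes to no segment
result = tf.unsorted_segment_sum(data, segment_ids, num_segments=2)

with tf.Session() as sess:
    print(sess.run(result))  # [1. 7.] -- the 2.0 with ID -1 is dropped
```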
diff --git a/tensorflow/core/ops/nn_ops.cc b/tensorflow/core/ops/nn_ops.cc
index de059a3e7e..a3609372a9 100644
--- a/tensorflow/core/ops/nn_ops.cc
+++ b/tensorflow/core/ops/nn_ops.cc
@@ -819,7 +819,7 @@ REGISTER_OP("DepthwiseConv2dNative")
.Input("input: T")
.Input("filter: T")
.Output("output: T")
- .Attr("T: {float, double}")
+ .Attr("T: {half, float, double}")
.Attr("strides: list(int)")
.Attr(GetPaddingAttrString())
.Attr(GetConvnetDataFormatAttrString())
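With `DT_HALF` added to the allowed types, the depthwise and 3-D convolutions below accept `float16` tensors; e.g., through the Python wrapper (graph construction shown; shapes are illustrative):

```python
import tensorflow as tf

x = tf.random_normal([1, 8, 8, 3], dtype=tf.float16)
w = tf.random_normal([3, 3, 3, 2], dtype=tf.float16)  # channel multiplier = 2
y = tf.nn.depthwise_conv2d(x, w, strides=[1, 1, 1, 1], padding="SAME")
print(y.dtype)  # <dtype: 'float16'>
```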
@@ -945,7 +945,7 @@ REGISTER_OP("Conv3D")
.Input("input: T")
.Input("filter: T")
.Output("output: T")
- .Attr("T: {float, double}")
+ .Attr("T: {half, float, double}")
.Attr("strides: list(int) >= 5")
.Attr(GetPaddingAttrString())
.Attr(GetConvnet3dDataFormatAttrString())
@@ -977,7 +977,7 @@ REGISTER_OP("Conv3DBackpropInput")
.Input("filter: T")
.Input("out_backprop: T")
.Output("output: T")
- .Attr("T: {float, double}")
+ .Attr("T: {half, float, double}")
.Attr("strides: list(int) >= 5")
.Attr(GetPaddingAttrString())
.Deprecated(10, "Use Conv3DBackpropInputV2")
@@ -1003,7 +1003,7 @@ REGISTER_OP("Conv3DBackpropFilter")
.Input("filter: T")
.Input("out_backprop: T")
.Output("output: T")
- .Attr("T: {float, double}")
+ .Attr("T: {half, float, double}")
.Attr("strides: list(int) >= 5")
.Attr(GetPaddingAttrString())
.Deprecated(10, "Use Conv3DBackpropFilterV2")
@@ -1032,7 +1032,7 @@ REGISTER_OP("Conv3DBackpropInputV2")
.Input("filter: T")
.Input("out_backprop: T")
.Output("output: T")
- .Attr("T: {float, double}")
+ .Attr("T: {half, float, double}")
.Attr("strides: list(int) >= 5")
.Attr(GetPaddingAttrString())
.Attr(GetConvnet3dDataFormatAttrString())
@@ -1069,7 +1069,7 @@ REGISTER_OP("Conv3DBackpropFilterV2")
.Input("filter_sizes: int32")
.Input("out_backprop: T")
.Output("output: T")
- .Attr("T: {float, double}")
+ .Attr("T: {half, float, double}")
.Attr("strides: list(int) >= 5")
.Attr(GetPaddingAttrString())
.Attr(GetConvnet3dDataFormatAttrString())
diff --git a/tensorflow/core/ops/ops.pbtxt b/tensorflow/core/ops/ops.pbtxt
index 2a74c20707..2c73441e7d 100644
--- a/tensorflow/core/ops/ops.pbtxt
+++ b/tensorflow/core/ops/ops.pbtxt
@@ -5261,6 +5261,7 @@ op {
type: "type"
allowed_values {
list {
+ type: DT_HALF
type: DT_FLOAT
type: DT_DOUBLE
}
@@ -5327,6 +5328,7 @@ op {
type: "type"
allowed_values {
list {
+ type: DT_HALF
type: DT_FLOAT
type: DT_DOUBLE
}
@@ -5382,6 +5384,7 @@ op {
type: "type"
allowed_values {
list {
+ type: DT_HALF
type: DT_FLOAT
type: DT_DOUBLE
}
@@ -5447,6 +5450,7 @@ op {
type: "type"
allowed_values {
list {
+ type: DT_HALF
type: DT_FLOAT
type: DT_DOUBLE
}
@@ -5502,6 +5506,7 @@ op {
type: "type"
allowed_values {
list {
+ type: DT_HALF
type: DT_FLOAT
type: DT_DOUBLE
}
@@ -6059,32 +6064,6 @@ op {
description: "By default, this op performs an inclusive cumsum, which means that the first\nelement of the input is identical to the first element of the output:\n\n```python\ntf.cumsum([a, b, c]) # => [a, a + b, a + b + c]\n```\n\nBy setting the `exclusive` kwarg to `True`, an exclusive cumsum is\nperformed instead:\n\n```python\ntf.cumsum([a, b, c], exclusive=True) # => [0, a, a + b]\n```\n\nBy setting the `reverse` kwarg to `True`, the cumsum is performed in the\nopposite direction:\n\n```python\ntf.cumsum([a, b, c], reverse=True) # => [a + b + c, b + c, c]\n```\n\nThis is more efficient than using separate `tf.reverse` ops.\n\nThe `reverse` and `exclusive` kwargs can also be combined:\n\n```python\ntf.cumsum([a, b, c], exclusive=True, reverse=True) # => [b + c, c, 0]\n```"
}
op {
- name: "DatasetToSingleElement"
- input_arg {
- name: "dataset"
- description: "A handle to a dataset that contains a single element."
- type: DT_VARIANT
- }
- output_arg {
- name: "components"
- description: "The components of the single element of `input`."
- type_list_attr: "output_types"
- }
- attr {
- name: "output_types"
- type: "list(type)"
- has_minimum: true
- minimum: 1
- }
- attr {
- name: "output_shapes"
- type: "list(shape)"
- has_minimum: true
- minimum: 1
- }
- summary: "Outputs the single element from the given dataset."
-}
-op {
name: "DebugGradientIdentity"
input_arg {
name: "input"
@@ -6716,41 +6695,6 @@ op {
description: "See SetOperationOp::SetOperationFromContext for values of `set_operation`.\n\nOutput `result` is a `SparseTensor` represented by `result_indices`,\n`result_values`, and `result_shape`. For `set1` and `set2` ranked `n`, this\nhas rank `n` and the same 1st `n-1` dimensions as `set1` and `set2`. The `nth`\ndimension contains the result of `set_operation` applied to the corresponding\n`[0...n-1]` dimension of `set`."
}
op {
- name: "DenseToSparseBatchDataset"
- input_arg {
- name: "input_dataset"
- description: "A handle to an input dataset. Must have a single component."
- type: DT_VARIANT
- }
- input_arg {
- name: "batch_size"
- description: "A scalar representing the number of elements to accumulate in a\nbatch."
- type: DT_INT64
- }
- input_arg {
- name: "row_shape"
- description: "A vector representing the dense shape of each row in the produced\nSparseTensor. The shape may be partially specified, using `-1` to indicate\nthat a particular dimension should use the maximum size of all batch elements."
- type: DT_INT64
- }
- output_arg {
- name: "handle"
- type: DT_VARIANT
- }
- attr {
- name: "output_types"
- type: "list(type)"
- has_minimum: true
- minimum: 1
- }
- attr {
- name: "output_shapes"
- type: "list(shape)"
- has_minimum: true
- minimum: 1
- }
- summary: "Creates a dataset that yields a SparseTensor for each element of the input."
-}
-op {
name: "DenseToSparseSetOperation"
input_arg {
name: "set1"
@@ -7090,21 +7034,6 @@ op {
description: "[min_range, max_range] are scalar floats that specify the range for\nthe \'input\' data. The \'mode\' attribute controls exactly which calculations are\nused to convert the float values to their quantized equivalents.\n\nIn \'MIN_COMBINED\' mode, each value of the tensor will undergo the following:\n\n```\nif T == qint8, in[i] += (range(T) + 1)/ 2.0\nout[i] = min_range + (in[i]* (max_range - min_range) / range(T))\n```\nhere `range(T) = numeric_limits<T>::max() - numeric_limits<T>::min()`\n\n*MIN_COMBINED Mode Example*\n\nIf the input comes from a QuantizedRelu6, the output type is\nquint8 (range of 0-255) but the possible range of QuantizedRelu6 is\n0-6. The min_range and max_range values are therefore 0.0 and 6.0.\nDequantize on quint8 will take each value, cast to float, and multiply\nby 6 / 255.\nNote that if quantizedtype is qint8, the operation will additionally add\neach value by 128 prior to casting.\n\nIf the mode is \'MIN_FIRST\', then this approach is used:\n\n```c++\nnum_discrete_values = 1 << (# of bits in T)\nrange_adjust = num_discrete_values / (num_discrete_values - 1)\nrange = (range_max - range_min) * range_adjust\nrange_scale = range / num_discrete_values\nconst double offset_input = static_cast<double>(input) - lowest_quantized;\nresult = range_min + ((input - numeric_limits<T>::min()) * range_scale)\n```\n\n*SCALED mode Example*\n\n`SCALED` mode matches the quantization approach used in\n`QuantizeAndDequantize{V2|V3}`.\n\nIf the mode is `SCALED`, we do not use the full range of the output type,\nchoosing to elide the lowest possible value for symmetry (e.g., output range is\n-127 to 127, not -128 to 127 for signed 8 bit quantization), so that 0.0 maps to\n0.\n\nWe first find the range of values in our tensor. The\nrange we use is always centered on 0, so we find m such that\n```c++\n m = max(abs(input_min), abs(input_max))\n```\n\nOur input tensor range is then `[-m, m]`.\n\nNext, we choose our fixed-point quantization buckets, `[min_fixed, max_fixed]`.\nIf T is signed, this is\n```\n num_bits = sizeof(T) * 8\n [min_fixed, max_fixed] =\n [-(1 << (num_bits - 1) - 1), (1 << (num_bits - 1)) - 1]\n```\n\nOtherwise, if T is unsigned, the fixed-point range is\n```\n [min_fixed, max_fixed] = [0, (1 << num_bits) - 1]\n```\n\nFrom this we compute our scaling factor, s:\n```c++\n s = (2 * m) / (max_fixed - min_fixed)\n```\n\nNow we can dequantize the elements of our tensor:\n```c++\nresult = input * s\n```"
}
op {
- name: "DeserializeIterator"
- input_arg {
- name: "resource_handle"
- description: "A handle to an iterator resource."
- type: DT_RESOURCE
- }
- input_arg {
- name: "serialized"
- description: "A variant tensor storing the state of the iterator contained in the\nresource."
- type: DT_VARIANT
- }
- summary: "Converts the given variant tensor to an iterator and stores it in the given resource."
- is_stateful: true
-}
-op {
name: "DeserializeManySparse"
input_arg {
name: "serialized_sparse"
@@ -10219,71 +10148,6 @@ op {
description: "*NOTE*: `GreaterEqual` supports broadcasting. More about broadcasting\n[here](http://docs.scipy.org/doc/numpy/user/basics.broadcasting.html)"
}
op {
- name: "GroupByWindowDataset"
- input_arg {
- name: "input_dataset"
- type: DT_VARIANT
- }
- input_arg {
- name: "key_func_other_arguments"
- type_list_attr: "Tkey_func_other_arguments"
- }
- input_arg {
- name: "reduce_func_other_arguments"
- type_list_attr: "Treduce_func_other_arguments"
- }
- input_arg {
- name: "window_size_func_other_arguments"
- type_list_attr: "Twindow_size_func_other_arguments"
- }
- output_arg {
- name: "handle"
- type: DT_VARIANT
- }
- attr {
- name: "key_func"
- type: "func"
- description: "A function mapping an element of `input_dataset`, concatenated\nwith `key_func_other_arguments` to a scalar value of type DT_INT64."
- }
- attr {
- name: "reduce_func"
- type: "func"
- }
- attr {
- name: "window_size_func"
- type: "func"
- }
- attr {
- name: "Tkey_func_other_arguments"
- type: "list(type)"
- has_minimum: true
- }
- attr {
- name: "Treduce_func_other_arguments"
- type: "list(type)"
- has_minimum: true
- }
- attr {
- name: "Twindow_size_func_other_arguments"
- type: "list(type)"
- has_minimum: true
- }
- attr {
- name: "output_types"
- type: "list(type)"
- has_minimum: true
- minimum: 1
- }
- attr {
- name: "output_shapes"
- type: "list(shape)"
- has_minimum: true
- minimum: 1
- }
- summary: "Creates a dataset that computes a windowed group-by on `input_dataset`."
- description: "// TODO(mrry): Support non-int64 keys."
-}
-op {
name: "HSVToRGB"
input_arg {
name: "images"
@@ -10744,30 +10608,6 @@ op {
description: "The upper regularized incomplete Gamma function is defined as:\n\n\\\\(Q(a, x) = Gamma(a, x) / Gamma(a) = 1 - P(a, x)\\\\)\n\nwhere\n\n\\\\(Gamma(a, x) = int_{x}^{\\infty} t^{a-1} exp(-t) dt\\\\)\n\nis the upper incomplete Gama function.\n\nNote, above `P(a, x)` (`Igamma`) is the lower regularized complete\nGamma function."
}
op {
- name: "IgnoreErrorsDataset"
- input_arg {
- name: "input_dataset"
- type: DT_VARIANT
- }
- output_arg {
- name: "handle"
- type: DT_VARIANT
- }
- attr {
- name: "output_types"
- type: "list(type)"
- has_minimum: true
- minimum: 1
- }
- attr {
- name: "output_shapes"
- type: "list(shape)"
- has_minimum: true
- minimum: 1
- }
- summary: "Creates a dataset that contains the elements of `input_dataset` ignoring errors."
-}
-op {
name: "Imag"
input_arg {
name: "input"
@@ -12539,54 +12379,6 @@ op {
is_stateful: true
}
op {
- name: "MapAndBatchDataset"
- input_arg {
- name: "input_dataset"
- type: DT_VARIANT
- }
- input_arg {
- name: "other_arguments"
- type_list_attr: "Targuments"
- }
- input_arg {
- name: "batch_size"
- description: "A scalar representing the number of elements to accumulate in a\nbatch. It determines the number of concurrent invocations of `f` that process\nelements from `input_dataset` in parallel."
- type: DT_INT64
- }
- input_arg {
- name: "num_parallel_batches"
- description: "A scalar representing the number of batches to create in\nparallel. Processing multiple batches in parallel benefits workloads prone to\nstragglers."
- type: DT_INT64
- }
- output_arg {
- name: "handle"
- type: DT_VARIANT
- }
- attr {
- name: "f"
- type: "func"
- }
- attr {
- name: "Targuments"
- type: "list(type)"
- has_minimum: true
- }
- attr {
- name: "output_types"
- type: "list(type)"
- has_minimum: true
- minimum: 1
- }
- attr {
- name: "output_shapes"
- type: "list(shape)"
- has_minimum: true
- minimum: 1
- }
- summary: "Creates a dataset that applies `f` to the outputs of `input_dataset` and then"
- description: "batches `batch_size` of them.\n\nUnlike a \"MapDataset\", which applies `f` sequentially, this dataset invokes up\nto `batch_size * num_parallel_batches` copies of `f` in parallel."
-}
-op {
name: "MapClear"
attr {
name: "capacity"
@@ -16257,57 +16049,6 @@ op {
description: "Builds a merged tensor such that\n\n```python\n merged[indices[m][i, ..., j], ...] = data[m][i, ..., j, ...]\n```\n\nFor example, if each `indices[m]` is scalar or vector, we have\n\n```python\n # Scalar indices:\n merged[indices[m], ...] = data[m][...]\n\n # Vector indices:\n merged[indices[m][i], ...] = data[m][i, ...]\n```\n\nEach `data[i].shape` must start with the corresponding `indices[i].shape`,\nand the rest of `data[i].shape` must be constant w.r.t. `i`. That is, we\nmust have `data[i].shape = indices[i].shape + constant`. In terms of this\n`constant`, the output shape is\n\n merged.shape = [max(indices)] + constant\n\nValues may be merged in parallel, so if an index appears in both `indices[m][i]`\nand `indices[n][j]`, the result may be invalid. This differs from the normal\nDynamicStitch operator that defines the behavior in that case.\n\nFor example:\n\n```python\n indices[0] = 6\n indices[1] = [4, 1]\n indices[2] = [[5, 2], [0, 3]]\n data[0] = [61, 62]\n data[1] = [[41, 42], [11, 12]]\n data[2] = [[[51, 52], [21, 22]], [[1, 2], [31, 32]]]\n merged = [[1, 2], [11, 12], [21, 22], [31, 32], [41, 42],\n [51, 52], [61, 62]]\n```\n\nThis method can be used to merge partitions created by `dynamic_partition`\nas illustrated on the following example:\n\n```python\n # Apply function (increments x_i) on elements for which a certain condition\n # apply (x_i != -1 in this example).\n x=tf.constant([0.1, -1., 5.2, 4.3, -1., 7.4])\n condition_mask=tf.not_equal(x,tf.constant(-1.))\n partitioned_data = tf.dynamic_partition(\n x, tf.cast(condition_mask, tf.int32) , 2)\n partitioned_data[1] = partitioned_data[1] + 1.0\n condition_indices = tf.dynamic_partition(\n tf.range(tf.shape(x)[0]), tf.cast(condition_mask, tf.int32) , 2)\n x = tf.dynamic_stitch(condition_indices, partitioned_data)\n # Here x=[1.1, -1., 6.2, 5.3, -1, 8.4], the -1. values remain\n # unchanged.\n```\n\n<div style=\"width:70%; margin:auto; margin-bottom:10px; margin-top:20px;\">\n<img style=\"width:100%\" src=\"https://www.tensorflow.org/images/DynamicStitch.png\" alt>\n</div>"
}
op {
- name: "ParallelInterleaveDataset"
- input_arg {
- name: "input_dataset"
- type: DT_VARIANT
- }
- input_arg {
- name: "other_arguments"
- type_list_attr: "Targuments"
- }
- input_arg {
- name: "cycle_length"
- type: DT_INT64
- }
- input_arg {
- name: "block_length"
- type: DT_INT64
- }
- input_arg {
- name: "sloppy"
- type: DT_BOOL
- }
- output_arg {
- name: "handle"
- type: DT_VARIANT
- }
- attr {
- name: "f"
- type: "func"
- description: "A function mapping elements of `input_dataset`, concatenated with\n`other_arguments`, to a Dataset variant that contains elements matching\n`output_types` and `output_shapes`."
- }
- attr {
- name: "Targuments"
- type: "list(type)"
- has_minimum: true
- }
- attr {
- name: "output_types"
- type: "list(type)"
- has_minimum: true
- minimum: 1
- }
- attr {
- name: "output_shapes"
- type: "list(shape)"
- has_minimum: true
- minimum: 1
- }
- summary: "Creates a dataset that applies `f` to the outputs of `input_dataset`."
- description: "The resulting dataset is similar to the `InterleaveDataset`, with the exception\nthat if retrieving the next value from a dataset would cause the requester to\nblock, it will skip that input dataset. This dataset is especially useful\nwhen loading data from a variable-latency datastores (e.g. HDFS, GCS), as it\nallows the training step to proceed so long as some data is available.\n\n!! WARNING !! This dataset is not deterministic!"
-}
-op {
name: "ParallelMapDataset"
input_arg {
name: "input_dataset"
@@ -16977,6 +16718,7 @@ op {
name: "U"
type: "list(type)"
has_minimum: true
+ minimum: 1
}
attr {
name: "message"
@@ -24114,53 +23856,6 @@ op {
description: "The input `tags` and `values` must have the same shape. The generated summary\nhas a summary value for each tag-value pair in `tags` and `values`."
}
op {
- name: "ScanDataset"
- input_arg {
- name: "input_dataset"
- type: DT_VARIANT
- }
- input_arg {
- name: "initial_state"
- type_list_attr: "Tstate"
- }
- input_arg {
- name: "other_arguments"
- type_list_attr: "Targuments"
- }
- output_arg {
- name: "handle"
- type: DT_VARIANT
- }
- attr {
- name: "f"
- type: "func"
- }
- attr {
- name: "Tstate"
- type: "list(type)"
- has_minimum: true
- minimum: 1
- }
- attr {
- name: "Targuments"
- type: "list(type)"
- has_minimum: true
- }
- attr {
- name: "output_types"
- type: "list(type)"
- has_minimum: true
- minimum: 1
- }
- attr {
- name: "output_shapes"
- type: "list(shape)"
- has_minimum: true
- minimum: 1
- }
- summary: "Creates a dataset successively reduces `f` over the elements of `input_dataset`."
-}
-op {
name: "ScatterAdd"
input_arg {
name: "ref"
@@ -25355,21 +25050,6 @@ op {
summary: "Computes gradients for the scaled exponential linear (Selu) operation."
}
op {
- name: "SerializeIterator"
- input_arg {
- name: "resource_handle"
- description: "A handle to an iterator resource."
- type: DT_RESOURCE
- }
- output_arg {
- name: "serialized"
- description: "A variant tensor storing the state of the iterator contained in the\nresource."
- type: DT_VARIANT
- }
- summary: "Converts the given `resource_handle` representing an iterator to a variant tensor."
- is_stateful: true
-}
-op {
name: "SerializeManySparse"
input_arg {
name: "sparse_indices"
@@ -29280,42 +28960,6 @@ op {
summary: "Splits a tensor into `num_split` tensors along one dimension."
}
op {
- name: "SqlDataset"
- input_arg {
- name: "driver_name"
- description: "The database type. Currently, the only supported type is \'sqlite\'."
- type: DT_STRING
- }
- input_arg {
- name: "data_source_name"
- description: "A connection string to connect to the database."
- type: DT_STRING
- }
- input_arg {
- name: "query"
- description: "A SQL query to execute."
- type: DT_STRING
- }
- output_arg {
- name: "handle"
- type: DT_VARIANT
- }
- attr {
- name: "output_types"
- type: "list(type)"
- has_minimum: true
- minimum: 1
- }
- attr {
- name: "output_shapes"
- type: "list(shape)"
- has_minimum: true
- minimum: 1
- }
- summary: "Creates a dataset that executes a SQL query and emits rows of the result set."
- is_stateful: true
-}
-op {
name: "Sqrt"
input_arg {
name: "x"
diff --git a/tensorflow/core/ops/summary_ops.cc b/tensorflow/core/ops/summary_ops.cc
index 5efbac7ad7..f778b48797 100644
--- a/tensorflow/core/ops/summary_ops.cc
+++ b/tensorflow/core/ops/summary_ops.cc
@@ -49,33 +49,6 @@ flush_millis: How often, in milliseconds, to flush the pending events and
filename_suffix: Every event file's name is suffixed with this suffix.
)doc");
-REGISTER_OP("CreateSummaryDbWriter")
- .Input("writer: resource")
- .Input("db_uri: string")
- .Input("experiment_name: string")
- .Input("run_name: string")
- .Input("user_name: string")
- .SetShapeFn(shape_inference::NoOutputs)
- .Doc(R"doc(
-Creates summary database writer accessible by given resource handle.
-
-This can be used to write tensors from the execution graph directly
-to a database. Only SQLite is supported right now. This function
-will create the schema if it doesn't exist. Entries in the Users,
-Experiments, and Runs tables will be created automatically if they
-don't already exist.
-
-writer: Handle to SummaryWriter resource to overwrite.
-db_uri: For example "file:/tmp/foo.sqlite".
-experiment_name: Can't contain ASCII control characters or <>. Case
- sensitive. If empty, then the Run will not be associated with any
- Experiment.
-run_name: Can't contain ASCII control characters or <>. Case sensitive.
- If empty, then each Tag will not be associated with any Run.
-user_name: Must be valid as both a DNS label and Linux username. If
- empty, then the Experiment will not be associated with any User.
-)doc");
-
REGISTER_OP("FlushSummaryWriter")
.Input("writer: resource")
.SetShapeFn(shape_inference::NoOutputs)
@@ -116,20 +89,6 @@ summary_metadata: Serialized SummaryMetadata protocol buffer containing
plugin-related metadata for this summary.
)doc");
-REGISTER_OP("ImportEvent")
- .Input("writer: resource")
- .Input("event: string")
- .SetShapeFn(shape_inference::NoOutputs)
- .Doc(R"doc(
-Outputs a `tf.Event` protocol buffer.
-
-When CreateSummaryDbWriter is being used, this op can be useful for
-importing data from event logs.
-
-writer: A handle to a summary writer.
-event: A string containing a binary-encoded tf.Event proto.
-)doc");
-
REGISTER_OP("WriteScalarSummary")
.Input("writer: resource")
.Input("global_step: int64")
diff --git a/tensorflow/core/platform/default/build_config.bzl b/tensorflow/core/platform/default/build_config.bzl
index 5eeb861bdd..6225c2c705 100644
--- a/tensorflow/core/platform/default/build_config.bzl
+++ b/tensorflow/core/platform/default/build_config.bzl
@@ -458,25 +458,16 @@ def tf_additional_lib_deps():
def tf_additional_core_deps():
return select({
- "//tensorflow:with_gcp_support_windows_override": [],
- "//tensorflow:with_gcp_support_android_override": [],
- "//tensorflow:with_gcp_support_ios_override": [],
"//tensorflow:with_gcp_support": [
"//tensorflow/core/platform/cloud:gcs_file_system",
],
"//conditions:default": [],
}) + select({
- "//tensorflow:with_hdfs_support_windows_override": [],
- "//tensorflow:with_hdfs_support_android_override": [],
- "//tensorflow:with_hdfs_support_ios_override": [],
"//tensorflow:with_hdfs_support": [
"//tensorflow/core/platform/hadoop:hadoop_file_system",
],
"//conditions:default": [],
}) + select({
- "//tensorflow:with_s3_support_windows_override": [],
- "//tensorflow:with_s3_support_android_override": [],
- "//tensorflow:with_s3_support_ios_override": [],
"//tensorflow:with_s3_support": [
"//tensorflow/core/platform/s3:s3_file_system",
],
@@ -486,9 +477,9 @@ def tf_additional_core_deps():
# TODO(jart, jhseu): Delete when GCP is default on.
def tf_additional_cloud_op_deps():
return select({
- "//tensorflow:with_gcp_support_windows_override": [],
- "//tensorflow:with_gcp_support_android_override": [],
- "//tensorflow:with_gcp_support_ios_override": [],
+ "//tensorflow:windows": [],
+ "//tensorflow:android": [],
+ "//tensorflow:ios": [],
"//tensorflow:with_gcp_support": [
"//tensorflow/contrib/cloud:bigquery_reader_ops_op_lib",
],
@@ -498,9 +489,9 @@ def tf_additional_cloud_op_deps():
# TODO(jart, jhseu): Delete when GCP is default on.
def tf_additional_cloud_kernel_deps():
return select({
- "//tensorflow:with_gcp_support_windows_override": [],
- "//tensorflow:with_gcp_support_android_override": [],
- "//tensorflow:with_gcp_support_ios_override": [],
+ "//tensorflow:windows": [],
+ "//tensorflow:android": [],
+ "//tensorflow:ios": [],
"//tensorflow:with_gcp_support": [
"//tensorflow/contrib/cloud/kernels:bigquery_reader_ops",
],
diff --git a/tensorflow/core/platform/default/build_config/BUILD b/tensorflow/core/platform/default/build_config/BUILD
index f746b15fee..f2fadb4558 100644
--- a/tensorflow/core/platform/default/build_config/BUILD
+++ b/tensorflow/core/platform/default/build_config/BUILD
@@ -12,6 +12,7 @@ load("//tensorflow:tensorflow.bzl", "tf_copts")
load("//tensorflow:tensorflow.bzl", "tf_cuda_library")
load("//tensorflow/core:platform/default/build_config_root.bzl", "if_static")
load("@local_config_sycl//sycl:platform.bzl", "sycl_library_path")
+load("@local_config_sycl//sycl:build_defs.bzl", "if_ccpp")
cc_library(
name = "gtest",
@@ -194,17 +195,16 @@ cc_library(
cc_library(
name = "sycl",
- data = [
+ data = if_ccpp([
"@local_config_sycl//sycl:{}".format(sycl_library_path("ComputeCpp")),
- ],
- linkopts = select({
- "//conditions:default": [
- "-Wl,-rpath,../local_config_sycl/sycl/lib",
- ],
- }),
- deps = [
- "@local_config_sycl//sycl:syclrt",
- ],
+ ]),
+ linkopts = if_ccpp([
+ "-Wl,-rpath,../local_config_sycl/sycl/lib",
+ ]),
+ deps = if_ccpp(
+ ["@local_config_sycl//sycl:syclrt"],
+ ["@local_config_sycl//sycl:sycl_headers"],
+ ),
)
filegroup(
diff --git a/tensorflow/core/platform/default/notification.h b/tensorflow/core/platform/default/notification.h
index 6a214dbd0a..5c401b7477 100644
--- a/tensorflow/core/platform/default/notification.h
+++ b/tensorflow/core/platform/default/notification.h
@@ -73,7 +73,7 @@ class Notification {
}
mutex mu_; // protects mutations of notified_
- condition_variable cv_; // signalled when notified_ becomes non-zero
+ condition_variable cv_; // signaled when notified_ becomes non-zero
std::atomic<bool> notified_; // mutations under mu_
};
diff --git a/tensorflow/core/platform/posix/error.cc b/tensorflow/core/platform/posix/error.cc
index e9baad5422..f8b0285c50 100644
--- a/tensorflow/core/platform/posix/error.cc
+++ b/tensorflow/core/platform/posix/error.cc
@@ -72,7 +72,7 @@ error::Code ErrnoToCode(int err_number) {
case EBUSY: // Device or resource busy
case ECHILD: // No child processes
case EISCONN: // Socket is connected
-#if !defined(_WIN32)
+#if !defined(_WIN32) && !defined(__HAIKU__)
case ENOTBLK: // Block device required
#endif
case ENOTCONN: // The socket is not connected
@@ -94,7 +94,7 @@ error::Code ErrnoToCode(int err_number) {
case ENODATA: // No message is available on the STREAM read queue
case ENOMEM: // Not enough space
case ENOSR: // No STREAM resources
-#if !defined(_WIN32)
+#if !defined(_WIN32) && !defined(__HAIKU__)
case EUSERS: // Too many users
#endif
code = error::RESOURCE_EXHAUSTED;
@@ -111,7 +111,7 @@ error::Code ErrnoToCode(int err_number) {
case EPFNOSUPPORT: // Protocol family not supported
#endif
case EPROTONOSUPPORT: // Protocol not supported
-#if !defined(_WIN32)
+#if !defined(_WIN32) && !defined(__HAIKU__)
case ESOCKTNOSUPPORT: // Socket type not supported
#endif
case EXDEV: // Improper link
@@ -131,7 +131,8 @@ error::Code ErrnoToCode(int err_number) {
case ENETUNREACH: // Network unreachable
case ENOLCK: // No locks available
case ENOLINK: // Link has been severed
-#if !(defined(__APPLE__) || defined(__FreeBSD__) || defined(_WIN32))
+#if !(defined(__APPLE__) || defined(__FreeBSD__) || defined(_WIN32) \
+ || defined(__HAIKU__))
case ENONET: // Machine is not on the network
#endif
code = error::UNAVAILABLE;
@@ -156,7 +157,7 @@ error::Code ErrnoToCode(int err_number) {
case ENOEXEC: // Exec format error
case ENOMSG: // No message of the desired type
case EPROTO: // Protocol error
-#if !defined(_WIN32)
+#if !defined(_WIN32) && !defined(__HAIKU__)
case EREMOTE: // Object is remote
#endif
code = error::UNKNOWN;
diff --git a/tensorflow/core/platform/posix/port.cc b/tensorflow/core/platform/posix/port.cc
index 6cba40ccfc..09f69a95c1 100644
--- a/tensorflow/core/platform/posix/port.cc
+++ b/tensorflow/core/platform/posix/port.cc
@@ -37,7 +37,8 @@ limitations under the License.
#ifdef TF_USE_SNAPPY
#include "snappy.h"
#endif
-#if (defined(__APPLE__) && defined(__MACH__)) || defined(__FreeBSD__)
+#if (defined(__APPLE__) && defined(__MACH__)) || defined(__FreeBSD__) \
+ || defined(__HAIKU__)
#include <thread>
#endif
@@ -61,7 +62,8 @@ int NumSchedulableCPUs() {
}
perror("sched_getaffinity");
#endif
-#if (defined(__APPLE__) && defined(__MACH__)) || defined(__FreeBSD__)
+#if (defined(__APPLE__) && defined(__MACH__)) || defined(__FreeBSD__) \
+ || defined(__HAIKU__)
unsigned int count = std::thread::hardware_concurrency();
if (count > 0) return static_cast<int>(count);
#endif
diff --git a/tensorflow/core/platform/vmodule_benchmark_test.cc b/tensorflow/core/platform/vmodule_benchmark_test.cc
deleted file mode 100644
index 0f9e75bf9c..0000000000
--- a/tensorflow/core/platform/vmodule_benchmark_test.cc
+++ /dev/null
@@ -1,28 +0,0 @@
-/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
- http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-
-#include "tensorflow/core/platform/logging.h"
-#include "tensorflow/core/platform/test_benchmark.h"
-
-namespace tensorflow {
-
-static void BM_DisabledVlog(int iters) {
- for (int i = 0; i < iters; ++i) {
- VLOG(1) << "Testing VLOG(1)!";
- }
-}
-BENCHMARK(BM_DisabledVlog);
-
-} // namespace tensorflow
diff --git a/tensorflow/core/platform/vmodule_test.cc b/tensorflow/core/platform/vmodule_test.cc
deleted file mode 100644
index 47b4b2e0e7..0000000000
--- a/tensorflow/core/platform/vmodule_test.cc
+++ /dev/null
@@ -1,117 +0,0 @@
-/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
- http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-
-// Test that popens a child process with the VLOG-ing environment variable set
-// for the logging framework, and observes VLOG_IS_ON and VLOG macro output.
-
-#include "tensorflow/core/platform/logging.h"
-#include "tensorflow/core/platform/platform.h"
-#include "tensorflow/core/platform/test.h"
-
-#include <string.h>
-
-namespace tensorflow {
-namespace {
-
-int RealMain(const char* argv0, bool do_vlog) {
- if (do_vlog) {
-#if !defined(PLATFORM_GOOGLE)
- // Note, we only test this when !defined(PLATFORM_GOOGLE) because
- // VmoduleActivated doesn't exist in that implementation.
- //
- // Also, we call this internal API to simulate what would happen if
- // differently-named translation units attempted to VLOG, so we don't need
- // to create dummy translation unit files.
- bool ok = internal::LogMessage::VmoduleActivated("vmodule_test.cc", 7) &&
- internal::LogMessage::VmoduleActivated("shoobadooba.h", 3);
- if (!ok) {
- fprintf(stderr, "vmodule activated levels not as expected.\n");
- return EXIT_FAILURE;
- }
-#endif
-
- // Print info on which VLOG levels are activated.
- fprintf(stderr, "VLOG_IS_ON(8)? %d\n", VLOG_IS_ON(8));
- fprintf(stderr, "VLOG_IS_ON(7)? %d\n", VLOG_IS_ON(7));
- fprintf(stderr, "VLOG_IS_ON(6)? %d\n", VLOG_IS_ON(6));
- // Do some VLOG-ing.
- VLOG(8) << "VLOG(8)";
- VLOG(7) << "VLOG(7)";
- VLOG(6) << "VLOG(6)";
- LOG(INFO) << "INFO";
- return EXIT_SUCCESS;
- }
-
- // Popen the child process.
- std::string command = std::string(argv0);
-#if defined(PLATFORM_GOOGLE)
- command = command + " do_vlog --vmodule=vmodule_test=7 --alsologtostderr";
-#else
- command =
- "TF_CPP_VMODULE=vmodule_test=7,shoobadooba=3 " + command + " do_vlog";
-#endif
- command += " 2>&1";
- fprintf(stderr, "Running: \"%s\"\n", command.c_str());
- FILE* f = popen(command.c_str(), "r");
- if (f == nullptr) {
- fprintf(stderr, "Failed to popen child: %s\n", strerror(errno));
- return EXIT_FAILURE;
- }
-
- // Read data from the child's stdout.
- constexpr int kBufferSizeBytes = 4096;
- char buffer[kBufferSizeBytes];
- size_t result = fread(buffer, sizeof(buffer[0]), kBufferSizeBytes - 1, f);
- if (result == 0) {
- fprintf(stderr, "Failed to read from child stdout: %zu %s\n", result,
- strerror(errno));
- return EXIT_FAILURE;
- }
- buffer[result] = '\0';
- int status = pclose(f);
- if (status == -1) {
- fprintf(stderr, "Failed to close popen child: %s\n", strerror(errno));
- return EXIT_FAILURE;
- }
-
- // Check output is as expected.
- const char kExpected[] =
- "VLOG_IS_ON(8)? 0\nVLOG_IS_ON(7)? 1\nVLOG_IS_ON(6)? 1\n";
- if (strstr(buffer, kExpected) == nullptr) {
- fprintf(stderr, "error: unexpected output from child: \"%.*s\"\n",
- kBufferSizeBytes, buffer);
- return EXIT_FAILURE;
- }
- bool ok = strstr(buffer, "VLOG(7)\n") != nullptr &&
- strstr(buffer, "VLOG(6)\n") != nullptr &&
- strstr(buffer, "VLOG(8)\n") == nullptr;
- if (!ok) {
- fprintf(stderr, "error: VLOG output not as expected: \"%.*s\"\n",
- kBufferSizeBytes, buffer);
- return EXIT_FAILURE;
- }
-
- // Success!
- return EXIT_SUCCESS;
-}
-
-} // namespace
-} // namespace tensorflow
-
-int main(int argc, char** argv) {
- testing::InitGoogleTest(&argc, argv);
- bool do_vlog = argc >= 2 && strcmp(argv[1], "do_vlog") == 0;
- return tensorflow::RealMain(argv[0], do_vlog);
-}
diff --git a/tensorflow/core/public/version.h b/tensorflow/core/public/version.h
index 1bf9c93101..ec077c4283 100644
--- a/tensorflow/core/public/version.h
+++ b/tensorflow/core/public/version.h
@@ -24,7 +24,7 @@ limitations under the License.
// TF_VERSION_SUFFIX is non-empty for pre-releases (e.g. "-alpha", "-alpha.1",
// "-beta", "-rc", "-rc.1")
-#define TF_VERSION_SUFFIX "-rc1"
+#define TF_VERSION_SUFFIX ""
#define TF_STR_HELPER(x) #x
#define TF_STR(x) TF_STR_HELPER(x)
diff --git a/tensorflow/core/util/bcast.cc b/tensorflow/core/util/bcast.cc
index 1eab7e3d02..47e6ddb3d8 100644
--- a/tensorflow/core/util/bcast.cc
+++ b/tensorflow/core/util/bcast.cc
@@ -68,7 +68,9 @@ BCast::BCast(const Vec& sx, const Vec& sy, const bool fewer_dims_optimization) {
// Output shape.
State curr = UNKNOWN;
const int64 x_i = x[i]; // i-th dimension of x.
+ CHECK_GE(x_i, 0);
const int64 y_i = y[i]; // i-th dimension of y.
+ CHECK_GE(y_i, 0);
int64 o_i; // i-th dimension of the output.
int64 bx_i; // i-th broadcast for x.
int64 by_i; // i-th broadcast for y.
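The new `CHECK`s reject negative dimensions before the per-dimension broadcast rule runs. That rule, numpy-style, in a small sketch:

```python
def broadcast_dim(x_i, y_i):
    assert x_i >= 0 and y_i >= 0  # what the added CHECK_GEs enforce
    if x_i == y_i:
        return x_i
    if x_i == 1:
        return y_i
    if y_i == 1:
        return x_i
    raise ValueError("incompatible dimensions: %d vs. %d" % (x_i, y_i))

print(broadcast_dim(1, 5))  # 5
print(broadcast_dim(3, 3))  # 3
```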
diff --git a/tensorflow/core/util/device_name_utils.cc b/tensorflow/core/util/device_name_utils.cc
index 90c3fed2e8..2d797c855a 100644
--- a/tensorflow/core/util/device_name_utils.cc
+++ b/tensorflow/core/util/device_name_utils.cc
@@ -116,6 +116,7 @@ bool DeviceNameUtils::ParseFullName(StringPiece fullname, ParsedName* p) {
if (fullname == "/") {
return true;
}
+ StringPiece tmp;
while (!fullname.empty()) {
bool progress = false;
if (str_util::ConsumePrefix(&fullname, "/job:")) {
diff --git a/tensorflow/core/util/mkl_util.h b/tensorflow/core/util/mkl_util.h
index 1bfa4f83a3..118ff0d0d6 100644
--- a/tensorflow/core/util/mkl_util.h
+++ b/tensorflow/core/util/mkl_util.h
@@ -26,18 +26,23 @@ limitations under the License.
#include "mkl_trans.h"
#include "tensorflow/core/framework/tensor.h"
#include "tensorflow/core/framework/tensor_shape.h"
-
#include "tensorflow/core/framework/op_kernel.h"
-#include "tensorflow/core/graph/mkl_graph_util.h"
#include "tensorflow/core/lib/core/errors.h"
#include "tensorflow/core/lib/gtl/array_slice.h"
#include "tensorflow/core/platform/logging.h"
#include "tensorflow/core/platform/macros.h"
#include "tensorflow/core/util/padding.h"
#include "tensorflow/core/util/tensor_format.h"
+#include "tensorflow/core/graph/mkl_graph_util.h"
#ifdef INTEL_MKL_DNN
#include "mkldnn.hpp"
+
+using mkldnn::memory;
+using mkldnn::reorder;
+using mkldnn::primitive;
+using mkldnn::padding_kind;
+using mkldnn::engine;
#endif
// The file contains a number of utility classes and functions used by MKL
@@ -51,6 +56,8 @@ namespace tensorflow {
// Tensorflow tensor.
typedef enum { W = 0, H = 1, C = 2, N = 3 } MklDims;
+typedef enum { Dim_N = 0, Dim_C = 1, Dim_H = 2, Dim_W = 3,
+ Dim_O = 0, Dim_I = 1 } MklDnnDims;
class MklShape {
public:
@@ -143,7 +150,9 @@ class MklShape {
size_t GetDimension() const { return dimension_; }
const size_t* GetSizes() const { return sizes_; }
int64 dim_size(int index) const { return sizes_[index]; }
- int64 tf_dim_size(int index) const { return sizes_[tf_to_mkl_dim_map_[index]]; }
+ int64 tf_dim_size(int index) const {
+ return sizes_[tf_to_mkl_dim_map_[index]];
+ }
const size_t* GetStrides() const { return strides_; }
const size_t* GetTfToMklDimMap() const { return tf_to_mkl_dim_map_; }
size_t tf_dim_idx(int index) const { return tf_to_mkl_dim_map_[index]; }
@@ -227,7 +236,8 @@ class MklShape {
(IS_MKL_TENSOR_OFFSET + sizeof(size_t)) // Location of dimension_
// Location of sizes. Note dim is not used here, left here
// to make macros consistent.
-#define SIZES_OFFSET(dims) (DIMS_OFFSET + sizeof(size_t))
+#define SIZES_OFFSET(dims) \
+ (DIMS_OFFSET + sizeof(size_t))
#define STRIDES_OFFSET(dims) \
(SIZES_OFFSET(dims) + dims * sizeof(size_t)) // Location of strides
#define MKL_LAYOUT_OFFSET(dims) \
@@ -309,6 +319,266 @@ class MklShape {
nullptr; // TF dimension corresponding to this MKL dimension
};
+#ifdef INTEL_MKL_DNN
+
+// Forward decl
+TensorFormat MklDnnDataFormatToTFDataFormat(memory::format format);
+
+class MklDnnShape {
+ private:
+ typedef struct {
+ /// Flag to indicate if the tensor is an MKL tensor or not
+ bool is_mkl_tensor_ = false;
+ /// Number of dimensions in Tensorflow format
+ size_t dimension_ = 0;
+ /// Required by MKLDNN for conversions
+ mkldnn_dims_t sizes_; // Required by MKL for conversions
+ memory::format tf_data_format_ = memory::format::format_undef;
+ memory::data_type T_ = memory::data_type::data_undef;
+ // MKL layout
+ mkldnn_memory_desc_t mkl_md_;
+ /// TF dimension corresponding to this MKL dimension
+ mkldnn_dims_t map_;
+ } MklShapeData;
+ MklShapeData data_;
+
+ typedef std::remove_extent<mkldnn_dims_t>::type mkldnn_dim_t;
+#define INVALID_DIM_SIZE -1
+
+
+ public:
+ MklDnnShape() {
+ for (size_t i = 0; i < sizeof(data_.sizes_) /
+ sizeof(data_.sizes_[0]); ++i) {
+ data_.sizes_[i] = -1;
+ }
+ for (size_t i = 0; i < sizeof(data_.map_) /
+ sizeof(data_.map_[0]); ++i) {
+ data_.map_[i] = -1;
+ }
+ }
+
+ ~MklDnnShape() {}
+ TF_DISALLOW_COPY_AND_ASSIGN(MklDnnShape); // Cannot copy
+
+ inline const bool IsMklTensor() const { return data_.is_mkl_tensor_; }
+ inline void SetMklTensor(bool is_mkl_tensor) {
+ data_.is_mkl_tensor_ = is_mkl_tensor;
+ }
+
+ inline void SetDimensions(const size_t dimension) {
+ data_.dimension_ = dimension;
+ }
+ inline size_t GetDimension(char dimension) const {
+ int index = GetMklDnnTensorDimIndex(dimension);
+ CHECK(index >= 0 && index < this->GetDimension())
+ << "Invalid index from the dimension: " << index << ", " << dimension;
+ return this->DimSize(index);
+ }
+
+ inline int32 GetMklDnnTensorDimIndex(char dimension) const {
+ switch (dimension) {
+ case 'N':
+ return MklDnnDims::Dim_N;
+ case 'C':
+ return MklDnnDims::Dim_C;
+ case 'H':
+ return MklDnnDims::Dim_H;
+ case 'W':
+ return MklDnnDims::Dim_W;
+ default:
+ LOG(FATAL) << "Invalid dimension: " << dimension;
+ return -1; // Avoid compiler warning about missing return value
+ }
+ }
+
+ inline size_t GetDimension() const { return data_.dimension_; }
+ inline const int* GetSizes() const {
+ return reinterpret_cast<const int*>(&data_.sizes_[0]);
+ }
+
+ // Returns an mkldnn::memory::dims object that contains the sizes of this
+ // MklDnnShape object.
+ inline memory::dims GetSizesAsMklDnnDims() const {
+ memory::dims retVal;
+ if (data_.is_mkl_tensor_) {
+ int dimensions = sizeof(data_.sizes_) / sizeof(data_.sizes_[0]);
+ for (int i = 0; i < dimensions; i++) {
+ if (data_.sizes_[i] != INVALID_DIM_SIZE)
+ retVal.push_back(data_.sizes_[i]);
+ }
+ } else {
+ CHECK_EQ(data_.is_mkl_tensor_, true);
+ }
+ return retVal;
+ }
+
+ inline int64 DimSize(int index) const {
+ CHECK_LT(index, sizeof(data_.sizes_)/sizeof(data_.sizes_[0]));
+ return data_.sizes_[index];
+ }
+
+  /// Return a TensorShape that describes the TensorFlow shape of the tensor
+  /// represented by this MklDnnShape.
+ inline TensorShape GetTfShape() {
+ CHECK_EQ(data_.is_mkl_tensor_, true);
+
+ std::vector<int32> shape(data_.dimension_, -1);
+ for (size_t idx = 0; idx < data_.dimension_; ++idx) {
+ shape[idx] = data_.sizes_[TfDimIdx(idx)];
+ }
+
+ TensorShape ts;
+ bool ret = TensorShapeUtils::MakeShape(shape, &ts).ok();
+ CHECK_EQ(ret, true);
+ return ts;
+ }
+
+ inline void SetElemType(memory::data_type dt) { data_.T_ = dt; }
+  inline memory::data_type GetElemType() const { return data_.T_; }
+
+ inline void SetMklLayout(memory::primitive_desc* pd) {
+ CHECK_NOTNULL(pd);
+ data_.mkl_md_ = pd->desc().data;
+ }
+ inline const memory::desc GetMklLayout() const {
+ return memory::desc(data_.mkl_md_);
+ }
+
+ inline memory::format GetTfDataFormat() const {
+ return data_.tf_data_format_;
+ }
+ /// We don't create primitive_descriptor for TensorFlow layout now.
+ /// We use lazy evaluation and create it only when needed.
+ inline void SetTfLayout(size_t dims, const memory::dims& sizes,
+ memory::format format) {
+ CHECK_EQ(dims, sizes.size());
+ data_.dimension_ = dims;
+ for (size_t ii = 0; ii < dims; ii++) {
+ data_.sizes_[ii] = sizes[ii];
+ }
+ data_.tf_data_format_ = format;
+ SetTfDimOrder(dims, format);
+ }
+ inline const memory::desc GetTfLayout() const {
+ memory::dims dims;
+ for (size_t ii = 0; ii < data_.dimension_; ii++) {
+ dims.push_back(data_.sizes_[ii]);
+ }
+ return memory::desc(dims, data_.T_, data_.tf_data_format_);
+ }
+ inline const memory::desc GetCurLayout() const {
+ return IsMklTensor() ? GetMklLayout() : GetTfLayout();
+ }
+
+  // nhasabni - I've removed the SetTfDimOrder overload that set a default
+  // order in the MKL-ML case. We don't need a default dimension order
+  // because an operator that does not receive a data_format attribute and
+  // gets all of its inputs in TensorFlow format will also produce its
+  // output in TensorFlow format.
+ inline void SetTfDimOrder(const size_t dimension, const mkldnn_dims_t map) {
+ CHECK(dimension == data_.dimension_);
+ for (size_t ii = 0; ii < dimension; ii++) {
+ data_.map_[ii] = map[ii];
+ }
+ }
+
+ inline void SetTfDimOrder(const size_t dimension, TensorFormat data_format) {
+ // TODO(nhasabni): Why do we restrict this to 4D?
+ CHECK_EQ(dimension, 4);
+ CHECK(dimension == data_.dimension_);
+ data_.map_[GetTensorDimIndex<2>(data_format, 'W')] = MklDnnDims::Dim_W;
+ data_.map_[GetTensorDimIndex<2>(data_format, 'H')] = MklDnnDims::Dim_H;
+ data_.map_[GetTensorDimIndex<2>(data_format, 'C')] = MklDnnDims::Dim_C;
+ data_.map_[GetTensorDimIndex<2>(data_format, 'N')] = MklDnnDims::Dim_N;
+ }
+
+ inline void SetTfDimOrder(const size_t dimension, memory::format format) {
+ TensorFormat data_format = MklDnnDataFormatToTFDataFormat(format);
+ SetTfDimOrder(dimension, data_format);
+ }
+
+ inline const mkldnn_dim_t* GetTfToMklDimMap() const {
+ return &data_.map_[0];
+ }
+ inline size_t TfDimIdx(int index) const { return data_.map_[index]; }
+ inline int64 TfDimSize(int index) const {
+ return data_.sizes_[TfDimIdx(index)];
+ }
+
+  /// Query the TF-MKL dimension ordering map and check if TensorFlow
+  /// dimension 'd' corresponds to MKL's Channel dimension.
+  inline bool IsMklChannelDim(int d) const {
+    return TfDimIdx(d) == MklDnnDims::Dim_C;
+  }
+  /// Query the TF-MKL dimension ordering map and check if TensorFlow
+  /// dimension 'd' corresponds to MKL's Batch dimension.
+  inline bool IsMklBatchDim(int d) const {
+    return TfDimIdx(d) == MklDnnDims::Dim_N;
+  }
+  /// Query the TF-MKL dimension ordering map and check if TensorFlow
+  /// dimension 'd' corresponds to MKL's Width dimension.
+  inline bool IsMklWidthDim(int d) const {
+    return TfDimIdx(d) == MklDnnDims::Dim_W;
+  }
+  /// Query the TF-MKL dimension ordering map and check if TensorFlow
+  /// dimension 'd' corresponds to MKL's Height dimension.
+  inline bool IsMklHeightDim(int d) const {
+    return TfDimIdx(d) == MklDnnDims::Dim_H;
+  }
+
+  /// Check whether the TF-MKL dimension ordering map indicates that the
+  /// input tensor is in NCHW format.
+ inline bool IsTensorInNCHWFormat() const {
+ TensorFormat data_format = FORMAT_NCHW;
+ return (IsMklBatchDim(GetTensorDimIndex<2>(data_format, 'N')) &&
+ IsMklChannelDim(GetTensorDimIndex<2>(data_format, 'C')) &&
+ IsMklHeightDim(GetTensorDimIndex<2>(data_format, 'H')) &&
+ IsMklWidthDim(GetTensorDimIndex<2>(data_format, 'W')));
+ }
+
+  /// Check whether the TF-MKL dimension ordering map indicates that the
+  /// input tensor is in NHWC format.
+ inline bool IsTensorInNHWCFormat() const {
+ TensorFormat data_format = FORMAT_NHWC;
+ return (IsMklBatchDim(GetTensorDimIndex<2>(data_format, 'N')) &&
+ IsMklChannelDim(GetTensorDimIndex<2>(data_format, 'C')) &&
+ IsMklHeightDim(GetTensorDimIndex<2>(data_format, 'H')) &&
+ IsMklWidthDim(GetTensorDimIndex<2>(data_format, 'W')));
+ }
+
+  /// The following methods are used for serializing and de-serializing the
+  /// contents of the MklDnnShape object (a round-trip sketch follows this
+  /// class). Since serialization copies the whole MklShapeData struct, the
+  /// data is serialized in declaration order:
+  /// is_mkl_tensor_ : dimension_ : sizes_ : tf_data_format_ : T_ : mkl_md_ :
+  /// map_
+
+  /// Size of the buffer required to hold the serialized object; computed by
+  /// following the order above.
+  inline size_t GetSerializeBufferSize() const {
+    return sizeof(MklShapeData);
+  }
+
+ void SerializeMklDnnShape(unsigned char* buf, size_t buf_size) const {
+ CHECK(buf_size >= GetSerializeBufferSize())
+ << "Buffer size is too small to SerializeMklDnnShape";
+ *reinterpret_cast<MklShapeData*>(buf) = data_;
+ }
+
+ void DeSerializeMklDnnShape(const unsigned char* buf, size_t buf_size) {
+ // Make sure buffer holds at least is_mkl_tensor_.
+ CHECK(buf_size >= sizeof(data_.is_mkl_tensor_))
+ << "Buffer size is too small in DeSerializeMklDnnShape";
+
+ const bool is_mkl_tensor = *reinterpret_cast<const bool*>(buf);
+ if (is_mkl_tensor) { // If it is an MKL Tensor then read the rest
+ CHECK(buf_size >= GetSerializeBufferSize())
+ << "Buffer size is too small in DeSerializeMklDnnShape";
+ data_ = *reinterpret_cast<const MklShapeData*>(buf);
+ }
+ }
+};
+
+#endif
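For readers tracing the new serialization contract, here is a minimal round-trip sketch. It is illustrative only, not part of the patch, and assumes an INTEL_MKL_DNN build with mkl_util.h and &lt;vector&gt; included:

```c++
// Hypothetical round trip through the serialization buffer, using only
// methods defined in MklDnnShape above.
MklDnnShape shape;
shape.SetMklTensor(true);
shape.SetTfLayout(4, memory::dims({1, 2, 3, 4}), memory::format::nchw);

std::vector<unsigned char> buf(shape.GetSerializeBufferSize());
shape.SerializeMklDnnShape(buf.data(), buf.size());

MklDnnShape restored;
restored.DeSerializeMklDnnShape(buf.data(), buf.size());
// restored.GetTfShape() now yields the same {1, 2, 3, 4} TensorShape.
```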
+
// List of MklShape objects. Used in Concat/Split layers.
typedef std::vector<MklShape> MklShapeList;
@@ -347,6 +617,36 @@ inline Tensor ConvertMklToTF(OpKernelContext* context, const Tensor& mkl_tensor,
return output_tensor;
}
+#ifdef INTEL_MKL_DNN
+template <typename T>
+inline Tensor ConvertMklToTF(OpKernelContext* context, const Tensor& mkl_tensor,
+ const MklDnnShape& mkl_shape) {
+ Tensor output_tensor;
+ TensorShape output_shape;
+
+#if 0
+ // TODO(nhasabni): need to implement
+ for (size_t j = 0; j < mkl_shape.GetDimension(); j++) {
+ // Outermost to innermost dimension
+ output_shape.AddDim(mkl_shape.GetSizes()[mkl_shape.tf_dim_idx(j)]);
+ }
+
+ // Allocate output tensor.
+ context->allocate_temp(DataTypeToEnum<T>::v(), output_shape, &output_tensor);
+
+ dnnLayout_t output_layout = static_cast<dnnLayout_t>(mkl_shape.GetTfLayout());
+ void* input_buffer = const_cast<T*>(mkl_tensor.flat<T>().data());
+ void* output_buffer = const_cast<T*>(output_tensor.flat<T>().data());
+
+ if (mkl_tensor.NumElements() != 0) {
+ mkl_shape.GetConvertedFlatData(output_layout, input_buffer, output_buffer);
+ }
+#endif
+
+ return output_tensor;
+}
+#endif
+
// Get the MKL shape from the second string tensor
inline void GetMklShape(OpKernelContext* ctext, int n, MklShape* mklshape) {
mklshape->DeSerializeMklShape(
@@ -359,6 +659,20 @@ inline void GetMklShape(OpKernelContext* ctext, int n, MklShape* mklshape) {
sizeof(uint8));
}
+#ifdef INTEL_MKL_DNN
+inline void GetMklShape(OpKernelContext* ctext, int n,
+ MklDnnShape* mklshape) {
+ mklshape->DeSerializeMklDnnShape(
+ ctext->input(GetTensorMetaDataIndex(n, ctext->num_inputs()))
+ .flat<uint8>()
+ .data(),
+ ctext->input(GetTensorMetaDataIndex(n, ctext->num_inputs()))
+ .flat<uint8>()
+ .size() *
+ sizeof(uint8));
+}
+#endif
+
// Gets the actual input
inline const Tensor& MklGetInput(OpKernelContext* ctext, int n) {
return ctext->input(GetTensorDataIndex(n, ctext->num_inputs()));
@@ -382,6 +696,27 @@ inline void GetMklShapeList(OpKernelContext* ctext, StringPiece name,
}
}
+#ifdef INTEL_MKL_DNN
+/// Get the shape of the input tensor at 'input_idx' in TensorShape format.
+/// If the input tensor is in MKL layout, the TensorShape is obtained from
+/// the serialized MklDnnShape.
+inline TensorShape GetTfShape(OpKernelContext* context, size_t input_idx) {
+ // Sanity check.
+ CHECK_NOTNULL(context);
+ CHECK_LT(input_idx, context->num_inputs());
+
+ MklDnnShape input_mkl_shape;
+ GetMklShape(context, input_idx, &input_mkl_shape);
+ if (input_mkl_shape.IsMklTensor()) {
+ return input_mkl_shape.GetTfShape();
+ } else {
+ const Tensor& t = MklGetInput(context, input_idx);
+ return t.shape();
+ }
+}
+#endif
+
// Allocate the second output tensor that will contain
// the MKL shape serialized
inline void AllocateOutputSetMklShape(OpKernelContext* ctext, int n,
@@ -397,6 +732,23 @@ inline void AllocateOutputSetMklShape(OpKernelContext* ctext, int n,
second_tensor->flat<uint8>().size() * sizeof(uint8));
}
+#ifdef INTEL_MKL_DNN
+// Allocate the second output tensor that will contain
+// the MKL shape serialized
+inline void AllocateOutputSetMklShape(OpKernelContext* ctext, int n,
+ const MklDnnShape& mkl_shape) {
+ Tensor* second_tensor = nullptr;
+ TensorShape second_shape;
+ second_shape.AddDim(mkl_shape.GetSerializeBufferSize());
+ OP_REQUIRES_OK(ctext, ctext->allocate_output(
+ GetTensorMetaDataIndex(n, ctext->num_outputs()),
+ second_shape, &second_tensor));
+ mkl_shape.SerializeMklDnnShape(
+ second_tensor->flat<uint8>().data(),
+ second_tensor->flat<uint8>().size() * sizeof(uint8));
+}
+#endif
+
// Allocate the output tensor, create a second output tensor that will contain
// the MKL shape serialized
inline void AllocateOutputSetMklShape(OpKernelContext* ctext, int n,
@@ -417,9 +769,43 @@ inline void AllocateOutputSetMklShape(OpKernelContext* ctext, int n,
second_tensor->flat<uint8>().size() * sizeof(uint8));
}
+#ifdef INTEL_MKL_DNN
+// Allocate the output tensor, create a second output tensor that will contain
+// the MKL shape serialized
+inline void AllocateOutputSetMklShape(OpKernelContext* ctext, int n,
+ Tensor** output,
+ const TensorShape& tf_shape,
+ const MklDnnShape& mkl_shape) {
+ Tensor* second_tensor = nullptr;
+ TensorShape second_shape;
+ second_shape.AddDim(mkl_shape.GetSerializeBufferSize());
+ OP_REQUIRES_OK(
+ ctext, ctext->allocate_output(GetTensorDataIndex(n, ctext->num_outputs()),
+ tf_shape, output));
+ OP_REQUIRES_OK(ctext, ctext->allocate_output(
+ GetTensorMetaDataIndex(n, ctext->num_outputs()),
+ second_shape, &second_tensor));
+ mkl_shape.SerializeMklDnnShape(
+ second_tensor->flat<uint8>().data(),
+ second_tensor->flat<uint8>().size() * sizeof(uint8));
+}
+#endif
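Taken together, the helpers above encode a convention in which every MKL op input carries a companion serialized-shape tensor and every output is allocated alongside one. A hedged sketch of what a kernel's Compute() might look like under that convention (the op, the index choices, and the enclosing class are illustrative assumptions):

```c++
// Hypothetical MKL-DNN kernel body using only the helpers defined above.
void Compute(OpKernelContext* context) override {
  // Read input 0: its data tensor plus its serialized shape metadata.
  const Tensor& input = MklGetInput(context, 0);
  MklDnnShape input_shape;
  GetMklShape(context, 0, &input_shape);
  TensorShape tf_shape = GetTfShape(context, 0);  // works for either layout

  // Emit output 0 in plain TF layout, together with its metadata tensor.
  MklDnnShape output_shape;
  output_shape.SetMklTensor(false);
  Tensor* output = nullptr;
  AllocateOutputSetMklShape(context, 0, &output, tf_shape, output_shape);
}
```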
+
// Allocates a temp tensor and returns the data buffer for temporary storage.
-// Currently
-// we only support F32, will need to templatize if other types are added
+#ifdef INTEL_MKL_DNN
+template <typename T>
+inline void AllocTmpBuffer(OpKernelContext* context, Tensor* tensor_out,
+ const memory::primitive_desc& pd, void** buf_out) {
+ TensorShape tf_shape;
+
+ tf_shape.AddDim(pd.get_size() / sizeof(T) + 1);
+ OP_REQUIRES_OK(context, context->allocate_temp(DataTypeToEnum<T>::v(),
+ tf_shape, tensor_out));
+ *buf_out = static_cast<void*>(tensor_out->flat<T>().data());
+}
+#endif
+
inline void AllocTmpBuffer(OpKernelContext* context, Tensor* tensor_out,
dnnLayout_t lt_buff, void** buf_out) {
TensorShape tf_shape;
@@ -435,7 +821,7 @@ inline void AllocTmpBuffer(OpKernelContext* context, Tensor* tensor_out,
template <typename T>
inline void AllocTmpBuffer(OpKernelContext* context, Tensor* tensor_out,
- TensorShape tf_shape) {
+ TensorShape tf_shape) {
OP_REQUIRES_OK(context, context->allocate_temp(DataTypeToEnum<T>::v(),
tf_shape, tensor_out));
}
@@ -669,6 +1055,8 @@ inline bool MklCompareShapes(const TensorShape* input_shape_0,
return true;
}
+// These functions do not compile when building with MKL-DNN, since mkl.h is
+// missing. We may need to remove them later.
// TODO(intel_tf): Remove this routine when faster MKL layout conversion is
// out.
inline void MklNHWCToNCHW(const Tensor& input, Tensor** output) {
@@ -707,18 +1095,11 @@ inline void MklNCHWToNHWC(const Tensor& input, Tensor** output) {
#ifdef INTEL_MKL_DNN
-using mkldnn::engine;
-using mkldnn::memory;
-using mkldnn::padding_kind;
-using mkldnn::primitive;
-using mkldnn::reorder;
-
/// Return MKL-DNN data type (memory::data_type) for input type T
///
/// @input None
/// @return memory::data_type corresponding to type T
-template <typename T>
-static memory::data_type MklDnnType();
+template<typename T> static memory::data_type MklDnnType();
/// Instantiation for float type. Add similar instantiations for other
/// type if needed.
@@ -733,15 +1114,26 @@ memory::data_type MklDnnType<float>() {
/// @return: memory::format corresponding to TensorFlow data format;
/// Fails with an error if invalid data format.
inline memory::format TFDataFormatToMklDnnDataFormat(TensorFormat format) {
- if (format == FORMAT_NHWC)
- return memory::format::nhwc;
- else if (format == FORMAT_NCHW)
- return memory::format::nchw;
- TF_CHECK_OK(Status(error::Code::INVALID_ARGUMENT, "Unsupported data format"));
+ if (format == FORMAT_NHWC) return memory::format::nhwc;
+ else if (format == FORMAT_NCHW) return memory::format::nchw;
+ TF_CHECK_OK(Status(error::Code::INVALID_ARGUMENT,
+ "Unsupported data format"));
// Return to get rid of compiler warning
return memory::format::format_undef;
}
+/// Map MKL-DNN data format to TensorFlow's data format
+///
+/// @input: memory::format
+/// @return: Tensorflow data format corresponding to memory::format
+/// Fails with an error if invalid data format.
+inline TensorFormat MklDnnDataFormatToTFDataFormat(memory::format format) {
+ if (format == memory::format::nhwc) return FORMAT_NHWC;
+ else if (format == memory::format::nchw) return FORMAT_NCHW;
+ TF_CHECK_OK(Status(error::Code::INVALID_ARGUMENT,
+ "Unsupported data format"));
+  // Return to get rid of compiler warning; TF_CHECK_OK above fails first.
+  return FORMAT_NHWC;
+}
+
/// Map TensorShape object into memory::dims required by MKL-DNN
///
/// This function will simply map input TensorShape into MKL-DNN dims
@@ -753,7 +1145,7 @@ inline memory::format TFDataFormatToMklDnnDataFormat(TensorFormat format) {
/// @return memory::dims corresponding to TensorShape
inline memory::dims TFShapeToMklDnnDims(const TensorShape& shape) {
memory::dims dims(shape.dims());
- for (unsigned int d = 0; d < shape.dims(); ++d) {
+ for (int d = 0; d < shape.dims(); ++d) {
dims[d] = shape.dim_size(d);
}
return dims;
@@ -769,7 +1161,7 @@ inline memory::dims TFShapeToMklDnnDims(const TensorShape& shape) {
/// @input TensorShape object in shape
/// @return memory::dims in MKL-DNN required NCHW format
inline memory::dims TFShapeToMklDnnDimsInNCHW(const TensorShape& shape,
- TensorFormat format) {
+ TensorFormat format) {
// Check validity of format.
CHECK_NE(TFDataFormatToMklDnnDataFormat(format),
memory::format::format_undef);
@@ -783,6 +1175,43 @@ inline memory::dims TFShapeToMklDnnDimsInNCHW(const TensorShape& shape,
return memory::dims({n, c, h, w});
}
+/// Map an MKL-DNN memory::dims object into a TensorShape object.
+///
+/// This function simply maps an input shape in MKL-DNN memory::dims format
+/// into TensorFlow's TensorShape object by preserving dimension order.
+///
+/// @input MKL-DNN memory::dims object
+/// @return TensorShape corresponding to memory::dims
+inline TensorShape MklDnnDimsToTFShape(const memory::dims& dims) {
+ std::vector<int32> shape(dims.size(), -1);
+ for (int d = 0; d < dims.size(); d++) {
+ shape[d] = dims[d];
+ }
+
+ TensorShape ret;
+ CHECK_EQ(TensorShapeUtils::MakeShape(shape, &ret).ok(), true);
+ return ret;
+}
+
+/// Function to calculate strides given a tensor shape in TensorFlow order.
+/// E.g., if dims_tf_order is {1, 2, 3, 4}, then per TensorFlow convention
+/// the dimension with size 1 is the outermost dimension and the dimension
+/// with size 4 is the innermost dimension. So the strides for this tensor
+/// are {4 * 3 * 2, 4 * 3, 4, 1}, i.e., {24, 12, 4, 1}.
+///
+/// @input TensorFlow shape in memory::dims type
+/// @return memory::dims containing strides for the tensor.
+inline memory::dims CalculateTFStrides(const memory::dims& dims_tf_order) {
+ CHECK_GT(dims_tf_order.size(), 0);
+ memory::dims strides(dims_tf_order.size());
+ int last_dim_idx = dims_tf_order.size() - 1;
+ strides[last_dim_idx] = 1;
+ for (int d = last_dim_idx - 1; d >= 0; d--) {
+ strides[d] = strides[d + 1] * dims_tf_order[d + 1];
+ }
+ return strides;
+}
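A small worked example of the conversion helpers above may help; the values are computed by hand and the snippet is only a sketch:

```c++
// Hypothetical: shape/stride conversions for an NHWC image batch.
TensorShape shape({8, 224, 224, 3});

memory::dims dims = TFShapeToMklDnnDims(shape);   // {8, 224, 224, 3}
memory::dims strides = CalculateTFStrides(dims);  // {150528, 672, 3, 1}

// MKL-DNN expects logical NCHW order regardless of the TF data format.
memory::dims nchw = TFShapeToMklDnnDimsInNCHW(shape, FORMAT_NHWC);
// nchw == {8, 3, 224, 224}

// And back, preserving dimension order.
TensorShape round_trip = MklDnnDimsToTFShape(dims);  // {8, 224, 224, 3}
```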
+
inline padding_kind TFPaddingToMklDnnPadding(Padding pad) {
// MKL-DNN only supports zero padding.
return padding_kind::zero;
@@ -808,23 +1237,21 @@ class MklDnnData {
const engine* cpu_engine_;
public:
- explicit MklDnnData(const engine* e)
- : user_memory_(nullptr),
- reorder_memory_(nullptr),
- op_md_(nullptr),
- cpu_engine_(e) {}
+ explicit MklDnnData(const engine* e) : user_memory_(nullptr),
+ reorder_memory_(nullptr),
+ op_md_(nullptr), cpu_engine_(e) {}
~MklDnnData() {
cpu_engine_ = nullptr; // We don't own this.
- delete (user_memory_);
- delete (reorder_memory_);
- delete (op_md_);
+ delete(user_memory_);
+ delete(reorder_memory_);
+ delete(op_md_);
}
- void* GetTensorBuffer(const Tensor* tensor) {
+ inline void* GetTensorBuffer(const Tensor* tensor) const {
CHECK_NOTNULL(tensor);
- return const_cast<void*>(
- static_cast<const void*>(tensor->flat<T>().data()));
+ return const_cast<void*>(static_cast<const void*>(
+ tensor->flat<T>().data()));
}
/// Set user memory primitive using specified dimensions, memory format and
@@ -835,35 +1262,83 @@ class MklDnnData {
/// an operation. E.g., filter of Conv2D is of shape {1, 2, 3, 4}, and
/// memory format HWIO, and the buffer that contains actual values is
/// pointed by data_buffer.
- void SetUsrMem(memory::dims dim, memory::format fm, void* data_buffer) {
- CHECK_NOTNULL(data_buffer);
- CHECK_NOTNULL(cpu_engine_);
- // TODO(nhasabni): can we remove dynamic memory allocation?
- user_memory_ =
- new memory(memory::primitive_desc(
- memory::desc(dim, MklDnnType<T>(), fm), *cpu_engine_),
- data_buffer);
+ inline void SetUsrMem(const memory::dims& dim, memory::format fm,
+ void* data_buffer = nullptr) {
+ auto md = memory::desc(dim, MklDnnType<T>(), fm);
+ SetUsrMem(md, data_buffer);
}
- void SetUsrMem(memory::dims dim, memory::format fm, const Tensor* tensor) {
+ inline void SetUsrMem(const memory::dims& dim, memory::format fm,
+ const Tensor* tensor) {
CHECK_NOTNULL(tensor);
SetUsrMem(dim, fm, GetTensorBuffer(tensor));
}
+ /// Helper function to create memory descriptor in Blocked format
+ ///
+ /// @input: Tensor dimensions
+ /// @input: strides corresponding to dimensions. One can use utility
+ /// function such as CalculateTFStrides to compute strides
+ /// for given dimensions.
+ /// @return: memory::desc object corresponding to blocked memory format
+ /// for given dimensions and strides.
+ static inline memory::desc CreateBlockedMemDesc(const memory::dims& dim,
+ const memory::dims& strides) {
+ CHECK_EQ(dim.size(), strides.size());
+
+    // We have to construct the memory descriptor in C style. This is not at
+    // all ideal, but MKLDNN does not offer any API to construct a descriptor
+    // in blocked format other than a constructor that accepts
+    // mkldnn_memory_desc_t.
+ mkldnn_memory_desc_t md;
+ md.primitive_kind = mkldnn_memory;
+ md.ndims = dim.size();
+ md.format = mkldnn_blocked;
+ md.data_type = memory::convert_to_c(MklDnnType<T>());
+
+ for (size_t i = 0; i < dim.size(); i++) {
+ md.layout_desc.blocking.block_dims[i] = 1;
+ md.layout_desc.blocking.strides[1][i] = 1;
+ md.layout_desc.blocking.strides[0][i] = strides[i];
+ md.layout_desc.blocking.padding_dims[i] = dim[i];
+ md.layout_desc.blocking.offset_padding_to_data[i] = 0;
+ md.dims[i] = dim[i];
+ }
+ md.layout_desc.blocking.offset_padding = 0;
+
+ return memory::desc(md);
+ }
+
+  /// A version of the SetUsrMem call that allows the user to create memory
+  /// in blocked format. In addition to accepting dimensions, it also accepts
+  /// strides. This allows the user to create memory for a tensor in a format
+  /// that MKLDNN does not support natively. E.g., MKLDNN has no native
+  /// format for 6-dimensional tensors, but by using the blocked format a
+  /// user can still create memory for a 6D tensor (see the sketch below).
+ inline void SetUsrMem(const memory::dims& dim, const memory::dims& strides,
+ void* data_buffer = nullptr) {
+ CHECK_EQ(dim.size(), strides.size());
+ auto blocked_md = MklDnnData<T>::CreateBlockedMemDesc(dim, strides);
+ SetUsrMem(blocked_md, data_buffer);
+ }
+
+ inline void SetUsrMem(const memory::dims& dim, const memory::dims& strides,
+ const Tensor* tensor) {
+ CHECK_NOTNULL(tensor);
+ SetUsrMem(dim, strides, GetTensorBuffer(tensor));
+ }
+
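As a sketch of this blocked-format path (it mirrors the unit test added below in mkl_util_test.cc):

```c++
// Hypothetical: user memory for a row-major {3, 4} tensor in blocked format.
engine cpu_engine(engine::cpu, 0);
MklDnnData<float> data(&cpu_engine);

memory::dims dims = {3, 4};
memory::dims strides = CalculateTFStrides(dims);  // {4, 1}
data.SetUsrMem(dims, strides);  // no buffer: the memory allocates its own

// The resulting descriptor reports mkldnn_blocked as its format.
memory::desc md = data.GetUsrMemDesc();
```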
/// A version of function to set user memory primitive that accepts memory
/// descriptor directly, instead of accepting dimensions and format. This
  /// function is more generic than the one above, but the function above is
/// sufficient in most cases.
- void SetUsrMem(memory::desc md, void* data_buffer) {
- CHECK_NOTNULL(data_buffer);
- CHECK_NOTNULL(cpu_engine_);
- // TODO(nhasabni): can we remove dynamic memory allocation?
- user_memory_ =
- new memory(memory::primitive_desc(md, *cpu_engine_), data_buffer);
+ inline void SetUsrMem(const memory::desc& md, void* data_buffer = nullptr) {
+ auto pd = memory::primitive_desc(md, *cpu_engine_);
+ SetUsrMem(pd, data_buffer);
}
/// A version of SetUsrMem with memory descriptor and tensor
- void SetUsrMem(memory::desc md, const Tensor* tensor) {
+ inline void SetUsrMem(const memory::desc& md, const Tensor* tensor) {
CHECK_NOTNULL(tensor);
SetUsrMem(md, GetTensorBuffer(tensor));
}
@@ -872,41 +1347,60 @@ class MklDnnData {
/// descriptor directly, instead of accepting dimensions and format. This
  /// function is more generic than the one above, but the function above is
/// sufficient in most cases.
- void SetUsrMem(memory::primitive_desc pd, void* data_buffer) {
- CHECK_NOTNULL(data_buffer);
+ inline void SetUsrMem(const memory::primitive_desc& pd,
+ void* data_buffer = nullptr) {
CHECK_NOTNULL(cpu_engine_);
// TODO(nhasabni): can we remove dynamic memory allocation?
- user_memory_ = new memory(pd, data_buffer);
+ if (data_buffer) {
+ user_memory_ = new memory(pd, data_buffer);
+ } else {
+ user_memory_ = new memory(pd);
+ }
}
/// A version of SetUsrMem with primitive descriptor and tensor
- void SetUsrMem(memory::primitive_desc pd, const Tensor* tensor) {
+ inline void SetUsrMem(const memory::primitive_desc& pd,
+ const Tensor* tensor) {
CHECK_NOTNULL(tensor);
SetUsrMem(pd, GetTensorBuffer(tensor));
}
/// Get function for user memory primitive.
- const memory* GetUsrMem() const { return user_memory_; }
+ inline const memory* GetUsrMem() const { return user_memory_; }
/// Get function for primitive descriptor of user memory primitive.
- const memory::primitive_desc GetUsrMemPrimDesc() const {
+ inline const memory::primitive_desc GetUsrMemPrimDesc() const {
CHECK_NOTNULL(user_memory_);
return user_memory_->get_primitive_desc();
}
/// Get function for descriptor of user memory.
- memory::desc GetUsrMemDesc() {
+ inline memory::desc GetUsrMemDesc() {
    // This is ugly. Why does MKL-DNN not provide a const desc() method?
const memory::primitive_desc pd = GetUsrMemPrimDesc();
return const_cast<memory::primitive_desc*>(&pd)->desc();
}
/// Get function for data buffer of user memory primitive.
- void* GetUsrMemDataHandle() const {
+ inline void* GetUsrMemDataHandle() const {
CHECK_NOTNULL(user_memory_);
return user_memory_->get_data_handle();
}
+  /// Set function for data buffer of user memory primitive.
+  /// Note: mkldnn's set_data_handle() returns void, so this does too.
+  inline void SetUsrMemDataHandle(void* data_buffer) {
+    CHECK_NOTNULL(user_memory_);
+    CHECK_NOTNULL(data_buffer);
+    user_memory_->set_data_handle(data_buffer);
+  }
+
+ /// Set function for data buffer of user memory primitive.
+ inline void SetUsrMemDataHandle(const Tensor* tensor) {
+ CHECK_NOTNULL(user_memory_);
+ CHECK_NOTNULL(tensor);
+ user_memory_->set_data_handle(GetTensorBuffer(tensor));
+ }
+
+  /// Get the memory primitive for the input or output of an op. If inputs
+  /// to an op require reorders, then this function returns the memory
+  /// primitive for the reorder. Otherwise, it returns the memory primitive
+  /// for the user memory (see the usage sketch after this class).
@@ -915,7 +1409,7 @@ class MklDnnData {
  /// execute Conv2D, we need memory primitives for I and F. But if reorder is
/// required for I and F (say I_r is reorder primitive for I; F_r is reorder
/// primitive for F), then we need I_r and F_r to perform Conv2D.
- const memory& GetOpMem() const {
+ inline const memory& GetOpMem() const {
return reorder_memory_ ? *reorder_memory_ : *user_memory_;
}
@@ -923,13 +1417,32 @@ class MklDnnData {
/// format. E.g., For Conv2D, the dimensions would be same as user dimensions
/// but memory::format would be mkldnn::any because we want MKL-DNN to choose
/// best layout/format for given input dimensions.
- void SetOpMemDesc(const memory::dims& dim, memory::format fm) {
+ inline void SetOpMemDesc(const memory::dims& dim, memory::format fm) {
// TODO(nhasabni): can we remove dynamic memory allocation?
op_md_ = new memory::desc(dim, MklDnnType<T>(), fm);
}
/// Get function for memory descriptor for an operation
- const memory::desc& GetOpMemDesc() const { return *op_md_; }
+ inline const memory::desc& GetOpMemDesc() const { return *op_md_; }
+
+ /// Predicate that checks if we need to reorder user's memory into memory
+ /// pointed by op_pd.
+ ///
+ /// @input: op_pd - memory primitive descriptor of the given input of an
+ /// operation
+ /// @return: true in case reorder of input is needed; false, otherwise.
+ inline bool IsReorderNeeded(const memory::primitive_desc& op_pd) const {
+ CHECK_NOTNULL(user_memory_);
+ return op_pd != user_memory_->get_primitive_desc();
+ }
+
+  /// Function to create a reorder from the memory pointed to by `from` to
+  /// the memory pointed to by `to`. Returns the created primitive.
+ inline primitive CreateReorder(const memory* from, const memory* to) const {
+ CHECK_NOTNULL(from);
+ CHECK_NOTNULL(to);
+ return reorder(*from, *to);
+ }
/// Function to handle input reordering
///
@@ -945,19 +1458,62 @@ class MklDnnData {
/// operation
/// @input: net - net to which to add reorder primitive in case it is needed.
/// @return: true in case reorder of input is needed; false, otherwise.
- bool CheckReorderToOpMem(const memory::primitive_desc& op_pd,
- std::vector<primitive>* net) {
+ inline bool CheckReorderToOpMem(const memory::primitive_desc& op_pd,
+ std::vector<primitive>* net) {
CHECK_NOTNULL(net);
CHECK_NOTNULL(user_memory_);
- if (op_pd != user_memory_->get_primitive_desc()) {
+ if (IsReorderNeeded(op_pd)) {
// TODO(nhasabni): can we remove dynamic memory allocation?
reorder_memory_ = new memory(op_pd);
- net->push_back(reorder(*user_memory_, *reorder_memory_));
+ net->push_back(CreateReorder(user_memory_, reorder_memory_));
+ return true;
+ }
+ return false;
+ }
+
+ /// Overloaded version of above function that accepts memory buffer
+ /// where output of reorder needs to be stored.
+ ///
+ /// @input: op_pd - memory primitive descriptor of the given input of an
+ /// operation
+  /// @input: reorder_data_handle - memory buffer where the output of the
+  ///                        reorder is to be stored. The primitive does not
+  ///                        check whether the buffer is large enough.
+ /// @input: net - net to which to add reorder primitive in case it is needed.
+ /// @return: true in case reorder of input is needed; false, otherwise.
+ inline bool CheckReorderToOpMem(const memory::primitive_desc& op_pd,
+ void* reorder_data_handle,
+ std::vector<primitive>* net) {
+ CHECK_NOTNULL(net);
+ CHECK_NOTNULL(reorder_data_handle);
+ CHECK_NOTNULL(user_memory_);
+ if (IsReorderNeeded(op_pd)) {
+ // TODO(nhasabni): can we remove dynamic memory allocation?
+ reorder_memory_ = new memory(op_pd, reorder_data_handle);
+ net->push_back(CreateReorder(user_memory_, reorder_memory_));
return true;
}
return false;
}
+ /// Another overloaded version of CheckReorderToOpMem that accepts Tensor
+ /// where output of reorder needs to be stored.
+ ///
+ /// @input: op_pd - memory primitive descriptor of the given input of an
+ /// operation
+  /// @input: reorder_tensor - Tensor whose buffer is to be used to store the
+  ///                   output of the reorder. The primitive does not check
+  ///                   whether the buffer is large enough.
+ /// @input: net - net to which to add reorder primitive in case it is needed.
+ /// @return: true in case reorder of input is needed; false, otherwise.
+ inline bool CheckReorderToOpMem(const memory::primitive_desc& op_pd,
+ Tensor* reorder_tensor,
+ std::vector<primitive>* net) {
+ CHECK_NOTNULL(net);
+ CHECK_NOTNULL(reorder_tensor);
+ return CheckReorderToOpMem(op_pd, GetTensorBuffer(reorder_tensor), net);
+ }
+
/// Function to handle output reorder
///
+  /// This function performs functionality very similar to input reordering
@@ -970,9 +1526,10 @@ class MklDnnData {
///
/// @input memory primitive descriptor for the given output of an operation
/// @return: true in case reorder of output is needed; false, otherwise.
- bool PrepareReorderToUserMemIfReq(const memory::primitive_desc& op_pd) {
+ inline bool PrepareReorderToUserMemIfReq(
+ const memory::primitive_desc& op_pd) {
CHECK_NOTNULL(user_memory_);
- if (op_pd != user_memory_->get_primitive_desc()) {
+ if (IsReorderNeeded(op_pd)) {
// TODO(nhasabni): can we remove dynamic memory allocation?
reorder_memory_ = new memory(op_pd);
return true;
@@ -987,11 +1544,11 @@ class MklDnnData {
/// to the user-specified output buffer.
///
/// @input: net - net to which to add reorder primitive
- void InsertReorderToUserMem(std::vector<primitive>* net) {
+ inline void InsertReorderToUserMem(std::vector<primitive>* net) {
CHECK_NOTNULL(net);
CHECK_NOTNULL(user_memory_);
CHECK_NOTNULL(reorder_memory_);
- net->push_back(reorder(*reorder_memory_, *user_memory_));
+ net->push_back(CreateReorder(reorder_memory_, user_memory_));
}
};
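Putting the reorder helpers together, a hedged end-to-end sketch of the input path; `op_pd` and `data_ptr` are illustrative assumptions supplied by the caller, and the stream call follows the usual MKL-DNN 0.x eager pattern:

```c++
// Hypothetical input-reorder flow for one op input.
engine cpu_engine(engine::cpu, 0);
MklDnnData<float> input(&cpu_engine);
std::vector<primitive> net;

// data_ptr: user's NCHW float buffer; op_pd: the op's preferred input
// primitive descriptor. Both are assumed to exist in the caller.
input.SetUsrMem(memory::dims({1, 3, 8, 8}), memory::format::nchw, data_ptr);
input.CheckReorderToOpMem(op_pd, &net);  // appends a reorder only if needed

const memory& op_input = input.GetOpMem();  // reordered or user memory
stream(stream::kind::eager).submit(net).wait();
```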
diff --git a/tensorflow/core/util/mkl_util_test.cc b/tensorflow/core/util/mkl_util_test.cc
new file mode 100644
index 0000000000..6aef3d86e9
--- /dev/null
+++ b/tensorflow/core/util/mkl_util_test.cc
@@ -0,0 +1,92 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifdef INTEL_MKL
+
+#include "tensorflow/core/util/mkl_util.h"
+
+#include "tensorflow/core/platform/test.h"
+
+namespace tensorflow {
+namespace {
+
+#ifdef INTEL_MKL_DNN
+
+TEST(MklUtilTest, MklDnnTfShape) {
+ auto cpu_engine = engine(engine::cpu, 0);
+ MklDnnData<float> a(&cpu_engine);
+
+ const int N = 1, C = 2, H = 3, W = 4;
+ memory::dims a_dims = {N, C, H, W};
+ MklDnnShape a_mkldnn_shape;
+ a_mkldnn_shape.SetMklTensor(true);
+ // Create TF layout in NCHW.
+ a_mkldnn_shape.SetTfLayout(a_dims.size(), a_dims, memory::format::nchw);
+ TensorShape a_tf_shape_nchw({N, C, H, W});
+ TensorShape a_tf_shape_nhwc({N, H, W, C});
+ TensorShape a_mkldnn_tf_shape = a_mkldnn_shape.GetTfShape();
+ // Check that returned shape is in NCHW format.
+ EXPECT_EQ(a_tf_shape_nchw, a_mkldnn_tf_shape);
+ EXPECT_NE(a_tf_shape_nhwc, a_mkldnn_tf_shape);
+
+ memory::dims b_dims = {N, C, H, W};
+ MklDnnShape b_mkldnn_shape;
+ b_mkldnn_shape.SetMklTensor(true);
+ // Create TF layout in NHWC.
+ b_mkldnn_shape.SetTfLayout(b_dims.size(), b_dims, memory::format::nhwc);
+ TensorShape b_tf_shape_nhwc({N, H, W, C});
+ TensorShape b_tf_shape_nchw({N, C, H, W});
+ TensorShape b_mkldnn_tf_shape = b_mkldnn_shape.GetTfShape();
+ // Check that returned shape is in NHWC format.
+ EXPECT_EQ(b_tf_shape_nhwc, b_mkldnn_tf_shape);
+ EXPECT_NE(b_tf_shape_nchw, b_mkldnn_tf_shape);
+}
+
+TEST(MklUtilTest, MklDnnBlockedFormatTest) {
+  // Let's create a 2D tensor of shape {3, 4}, with the dimension of size 3
+  // innermost first (case 1) and then outermost (case 2).
+ auto cpu_engine = engine(engine::cpu, 0);
+
+ // Setting for case 1
+ MklDnnData<float> a(&cpu_engine);
+ memory::dims dim1 = {3, 4};
+ memory::dims strides1 = {1, 3};
+ a.SetUsrMem(dim1, strides1);
+
+ memory::desc a_md1 = a.GetUsrMemDesc();
+ EXPECT_EQ(a_md1.data.ndims, 2);
+ EXPECT_EQ(a_md1.data.dims[0], 3);
+ EXPECT_EQ(a_md1.data.dims[1], 4);
+ EXPECT_EQ(a_md1.data.format, mkldnn_blocked);
+
+ // Setting for case 2
+ MklDnnData<float> b(&cpu_engine);
+ memory::dims dim2 = {3, 4};
+ memory::dims strides2 = {4, 1};
+ b.SetUsrMem(dim2, strides2);
+
+ memory::desc b_md2 = b.GetUsrMemDesc();
+ EXPECT_EQ(b_md2.data.ndims, 2);
+ EXPECT_EQ(b_md2.data.dims[0], 3);
+ EXPECT_EQ(b_md2.data.dims[1], 4);
+ EXPECT_EQ(b_md2.data.format, mkldnn_blocked);
+}
+
+#endif // INTEL_MKL_DNN
+} // namespace
+} // namespace tensorflow
+
+#endif // INTEL_MKL
diff --git a/tensorflow/docs_src/api_guides/python/threading_and_queues.md b/tensorflow/docs_src/api_guides/python/threading_and_queues.md
index ab95ce0af9..8ad4c4c075 100644
--- a/tensorflow/docs_src/api_guides/python/threading_and_queues.md
+++ b/tensorflow/docs_src/api_guides/python/threading_and_queues.md
@@ -3,7 +3,7 @@
Note: In versions of TensorFlow before 1.2, we recommended using multi-threaded,
queue-based input pipelines for performance. Beginning with TensorFlow 1.4,
however, we recommend using the `tf.data` module instead. (See
-[Datasets](datasets) for details. In TensorFlow 1.2 and 1.3, the module was
+@{$datasets$Datasets} for details. In TensorFlow 1.2 and 1.3, the module was
called `tf.contrib.data`.) The `tf.data` module offers an easier-to-use
interface for constructing efficient input pipelines. Furthermore, we've stopped
developing the old multi-threaded, queue-based input pipelines. We've retained
diff --git a/tensorflow/docs_src/get_started/get_started.md b/tensorflow/docs_src/get_started/get_started.md
index 8409962744..be14ab4026 100644
--- a/tensorflow/docs_src/get_started/get_started.md
+++ b/tensorflow/docs_src/get_started/get_started.md
@@ -272,7 +272,7 @@ train = optimizer.minimize(loss)
```
```python
-sess.run(init) # reset values to incorrect defaults.
+sess.run(init) # reset variables to incorrect defaults.
for i in range(1000):
sess.run(train, {x: [1, 2, 3, 4], y: [0, -1, -2, -3]})
@@ -317,7 +317,7 @@ y_train = [0, -1, -2, -3]
# training loop
init = tf.global_variables_initializer()
sess = tf.Session()
-sess.run(init) # reset values to wrong
+sess.run(init) # initialize variables with incorrect defaults.
for i in range(1000):
sess.run(train, {x: x_train, y: y_train})
@@ -383,7 +383,7 @@ train_input_fn = tf.estimator.inputs.numpy_input_fn(
eval_input_fn = tf.estimator.inputs.numpy_input_fn(
{"x": x_eval}, y_eval, batch_size=4, num_epochs=1000, shuffle=False)
-# We can invoke 1000 training steps by invoking the method and passing the
+# We can invoke 1000 training steps by invoking the method and passing the
# training data set.
estimator.train(input_fn=input_fn, steps=1000)
diff --git a/tensorflow/docs_src/get_started/input_fn.md b/tensorflow/docs_src/get_started/input_fn.md
index 9d3af5d96a..0db5c6143a 100644
--- a/tensorflow/docs_src/get_started/input_fn.md
+++ b/tensorflow/docs_src/get_started/input_fn.md
@@ -191,7 +191,7 @@ import pandas as pd
def get_input_fn_from_pandas(data_set, num_epochs=None, shuffle=True):
return tf.estimator.inputs.pandas_input_fn(
- x=pdDataFrame(...),
+ x=pd.DataFrame(...),
y=pd.Series(...),
num_epochs=num_epochs,
shuffle=shuffle)
@@ -267,8 +267,8 @@ tf.logging.set_verbosity(tf.logging.INFO)
Define the column names for the data set in `COLUMNS`. To distinguish features
from the label, also define `FEATURES` and `LABEL`. Then read the three CSVs
-(@{tf.train},
-@{tf.test}, and
+([train](http://download.tensorflow.org/data/boston_train.csv),
+[test](http://download.tensorflow.org/data/boston_test.csv), and
[predict](http://download.tensorflow.org/data/boston_predict.csv)) into _pandas_
`DataFrame`s:
diff --git a/tensorflow/docs_src/get_started/monitors.md b/tensorflow/docs_src/get_started/monitors.md
deleted file mode 100644
index 5606e95365..0000000000
--- a/tensorflow/docs_src/get_started/monitors.md
+++ /dev/null
@@ -1,406 +0,0 @@
-# Logging and Monitoring Basics with tf.contrib.learn
-
-When training a model, it’s often valuable to track and evaluate progress in
-real time. In this tutorial, you’ll learn how to use TensorFlow’s logging
-capabilities and the `Monitor` API to audit the in-progress training of a neural
-network classifier for categorizing irises. This tutorial builds on the code
-developed in @{$estimator$tf.estimator Quickstart} so if you
-haven't yet completed that tutorial, you may want to explore it first,
-especially if you're looking for an intro/refresher on tf.contrib.learn basics.
-
-## Setup {#setup}
-
-For this tutorial, you'll be building upon the following code from
-@{$estimator$tf.estimator Quickstart}:
-
-```python
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-
-import os
-
-import numpy as np
-import tensorflow as tf
-
-# Data sets
-IRIS_TRAINING = os.path.join(os.path.dirname(__file__), "iris_training.csv")
-IRIS_TEST = os.path.join(os.path.dirname(__file__), "iris_test.csv")
-
-def main(unused_argv):
- # Load datasets.
- training_set = tf.contrib.learn.datasets.base.load_csv_with_header(
- filename=IRIS_TRAINING, target_dtype=np.int, features_dtype=np.float32)
- test_set = tf.contrib.learn.datasets.base.load_csv_with_header(
- filename=IRIS_TEST, target_dtype=np.int, features_dtype=np.float32)
-
- # Specify that all features have real-value data
- feature_columns = [tf.contrib.layers.real_valued_column("", dimension=4)]
-
- # Build 3 layer DNN with 10, 20, 10 units respectively.
- classifier = tf.contrib.learn.DNNClassifier(feature_columns=feature_columns,
- hidden_units=[10, 20, 10],
- n_classes=3,
- model_dir="/tmp/iris_model")
-
- # Fit model.
- classifier.fit(x=training_set.data,
- y=training_set.target,
- steps=2000)
-
- # Evaluate accuracy.
- accuracy_score = classifier.evaluate(x=test_set.data,
- y=test_set.target)["accuracy"]
- print('Accuracy: {0:f}'.format(accuracy_score))
-
- # Classify two new flower samples.
- new_samples = np.array(
- [[6.4, 3.2, 4.5, 1.5], [5.8, 3.1, 5.0, 1.7]], dtype=float)
- y = list(classifier.predict(new_samples, as_iterable=True))
- print('Predictions: {}'.format(str(y)))
-
-if __name__ == "__main__":
- tf.app.run()
-```
-
-Copy the above code into a file, and download the corresponding
-[training](http://download.tensorflow.org/data/iris_training.csv) and
-[test](http://download.tensorflow.org/data/iris_test.csv) data sets to the same
-directory.
-
-In the following sections, you'll progressively make updates to the above code
-to add logging and monitoring capabilities. Final code incorporating all updates
-is [available for download
-here](https://www.tensorflow.org/code/tensorflow/examples/tutorials/monitors/iris_monitors.py).
-
-## Overview
-
-The @{$estimator$tf.estimator Quickstart tutorial} walked through
-how to implement a neural net classifier to categorize iris examples into one of
-three species.
-
-But when [the code](#setup) from this tutorial is run, the output contains no
-logging tracking how model training is progressing&mdash;only the results of the
-`print` statements that were included:
-
-```none
-Accuracy: 0.933333
-Predictions: [1 2]
-```
-
-Without any logging, model training feels like a bit of a black box; you can't
-see what's happening as TensorFlow steps through gradient descent, get a sense
-of whether the model is converging appropriately, or audit to determine whether
-[early stopping](https://en.wikipedia.org/wiki/Early_stopping) might be
-appropriate.
-
-One way to address this problem would be to split model training into multiple
-`fit` calls with smaller numbers of steps in order to evaluate accuracy more
-progressively. However, this is not recommended practice, as it greatly slows
-down model training. Fortunately, tf.contrib.learn offers another solution: a
-@{tf.contrib.learn.monitors$Monitor API} designed to help
-you log metrics and evaluate your model while training is in progress. In the
-following sections, you'll learn how to enable logging in TensorFlow, set up a
-ValidationMonitor to do streaming evaluations, and visualize your metrics using
-TensorBoard.
-
-## Enabling Logging with TensorFlow
-
-TensorFlow uses five different levels for log messages. In order of ascending
-severity, they are `DEBUG`, `INFO`, `WARN`, `ERROR`, and `FATAL`. When you
-configure logging at any of these levels, TensorFlow will output all log
-messages corresponding to that level and all levels of higher severity. For
-example, if you set a logging level of `ERROR`, you'll get log output containing
-`ERROR` and `FATAL` messages, and if you set a level of `DEBUG`, you'll get log
-messages from all five levels.
-
-By default, TensorFlow is configured at a logging level of `WARN`, but when
-tracking model training, you'll want to adjust the level to `INFO`, which will
-provide additional feedback as `fit` operations are in progress.
-
-Add the following line to the beginning of your code (right after your
-`import`s):
-
-```python
-tf.logging.set_verbosity(tf.logging.INFO)
-```
-
-Now when you run the code, you'll see additional log output like the following:
-
-```none
-INFO:tensorflow:loss = 1.18812, step = 1
-INFO:tensorflow:loss = 0.210323, step = 101
-INFO:tensorflow:loss = 0.109025, step = 201
-```
-
-With `INFO`-level logging, tf.contrib.learn automatically outputs [training-loss
-metrics](https://en.wikipedia.org/wiki/Loss_function) to stderr after every 100
-steps.
-
-## Configuring a ValidationMonitor for Streaming Evaluation
-
-Logging training loss is helpful to get a sense of whether your model is
-converging, but what if you want further insight into what's happening during
-training? tf.contrib.learn provides several high-level `Monitor`s you can attach
-to your `fit` operations to further track metrics and/or debug lower-level
-TensorFlow operations during model training, including:
-
-Monitor | Description
-------------------- | -----------
-`CaptureVariable` | Saves a specified variable's values into a collection at every _n_ steps of training
-`PrintTensor` | Logs a specified tensor's values at every _n_ steps of training
-`SummarySaver` | Saves @{tf.Summary} [protocol buffers](https://developers.google.com/protocol-buffers/) for a given tensor using a @{tf.summary.FileWriter} at every _n_ steps of training
-`ValidationMonitor` | Logs a specified set of evaluation metrics at every _n_ steps of training, and, if desired, implements early stopping under certain conditions
-
-### Evaluating Every *N* Steps
-
-For the iris neural network classifier, while logging training loss, you might
-also want to simultaneously evaluate against test data to see how well the model
-is generalizing. You can accomplish this by configuring a `ValidationMonitor`
-with the test data (`test_set.data` and `test_set.target`), and setting how
-often to evaluate with `every_n_steps`. The default value of `every_n_steps` is
-`100`; here, set `every_n_steps` to `50` to evaluate after every 50 steps of
-model training:
-
-```python
-validation_monitor = tf.contrib.learn.monitors.ValidationMonitor(
- test_set.data,
- test_set.target,
- every_n_steps=50)
-```
-
-Place this code right before the line instantiating the `classifier`.
-
-`ValidationMonitor`s rely on saved checkpoints to perform evaluation operations,
-so you'll want to modify instantiation of the `classifier` to add a
-@{tf.contrib.learn.RunConfig} that includes
-`save_checkpoints_secs`, which specifies how many seconds should elapse between
-checkpoint saves during training. Because the iris data set is quite small, and
-thus trains quickly, it makes sense to set `save_checkpoints_secs` to 1 (saving
-a checkpoint every second) to ensure a sufficient number of checkpoints:
-
-```python
-classifier = tf.contrib.learn.DNNClassifier(
- feature_columns=feature_columns,
- hidden_units=[10, 20, 10],
- n_classes=3,
- model_dir="/tmp/iris_model",
- config=tf.contrib.learn.RunConfig(save_checkpoints_secs=1))
-```
-
-NOTE: The `model_dir` parameter specifies an explicit directory
-(`/tmp/iris_model`) for model data to be stored; this directory path will be
-easier to reference later on than an autogenerated one. Each time you run the
-code, any existing data in `/tmp/iris_model` will be loaded, and model training
-will continue where it left off in the last run (e.g., running the script twice
-in succession will execute 4000 steps during training&mdash;2000 during each
-`fit` operation). To start over model training from scratch, delete
-`/tmp/iris_model` before running the code.
-
-Finally, to attach your `validation_monitor`, update the `fit` call to include a
-`monitors` param, which takes a list of all monitors to run during model
-training:
-
-```python
-classifier.fit(x=training_set.data,
- y=training_set.target,
- steps=2000,
- monitors=[validation_monitor])
-```
-
-Now, when you rerun the code, you should see validation metrics in your log
-output, e.g.:
-
-```none
-INFO:tensorflow:Validation (step 50): loss = 1.71139, global_step = 0, accuracy = 0.266667
-...
-INFO:tensorflow:Validation (step 300): loss = 0.0714158, global_step = 268, accuracy = 0.966667
-...
-INFO:tensorflow:Validation (step 1750): loss = 0.0574449, global_step = 1729, accuracy = 0.966667
-```
-
-### Customizing the Evaluation Metrics with MetricSpec
-
-By default, if no evaluation metrics are specified, `ValidationMonitor` will log
-both [loss](https://en.wikipedia.org/wiki/Loss_function) and accuracy, but you
-can customize the list of metrics that will be run every 50 steps. To specify
-the exact metrics you'd like to run in each evaluation pass, you can add a
-`metrics` param to the `ValidationMonitor` constructor. `metrics` takes a dict
-of key/value pairs, where each key is the name you'd like logged for the metric,
-and the corresponding value is a
-[`MetricSpec`](https://www.tensorflow.org/code/tensorflow/contrib/learn/python/learn/metric_spec.py)
-object.
-
-The `MetricSpec` constructor accepts four parameters:
-
-* `metric_fn`. The function that calculates and returns the value of a metric.
- This can be a predefined function available in the
- @{tf.contrib.metrics} module, such as
- @{tf.contrib.metrics.streaming_precision} or
- @{tf.contrib.metrics.streaming_recall}.
-
- Alternatively, you can define your own custom metric function, which must
- take `predictions` and `labels` tensors as arguments (a `weights` argument
- can also optionally be supplied). The function must return the value of the
- metric in one of two formats:
-
- * A single tensor
- * A pair of ops `(value_op, update_op)`, where `value_op` returns the
- metric value and `update_op` performs a corresponding operation to
- update internal model state.
-
-* `prediction_key`. The key of the tensor containing the predictions returned
- by the model. This argument may be omitted if the model returns either a
- single tensor or a dict with a single entry. For a `DNNClassifier` model,
- class predictions will be returned in a tensor with the key
- @{tf.contrib.learn.PredictionKey.CLASSES}.
-
-* `label_key`. The key of the tensor containing the labels returned by the
- model, as specified by the model's @{$input_fn$`input_fn`}. As
- with `prediction_key`, this argument may be omitted if the `input_fn`
- returns either a single tensor or a dict with a single entry. In the iris
- example in this tutorial, the `DNNClassifier` does not have an `input_fn`
- (`x`,`y` data is passed directly to `fit`), so it's not necessary to provide
- a `label_key`.
-
-* `weights_key`. *Optional*. The key of the tensor (returned by the
- @{$input_fn$`input_fn`}) containing weights inputs for the
- `metric_fn`.
-
-The following code creates a `validation_metrics` dict that defines three
-metrics to log during model evaluation:
-
-* `"accuracy"`, using @{tf.contrib.metrics.streaming_accuracy}
- as the `metric_fn`
-* `"precision"`, using @{tf.contrib.metrics.streaming_precision}
- as the `metric_fn`
-* `"recall"`, using @{tf.contrib.metrics.streaming_recall}
- as the `metric_fn`
-
-```python
-validation_metrics = {
- "accuracy":
- tf.contrib.learn.MetricSpec(
- metric_fn=tf.contrib.metrics.streaming_accuracy,
- prediction_key=tf.contrib.learn.PredictionKey.CLASSES),
- "precision":
- tf.contrib.learn.MetricSpec(
- metric_fn=tf.contrib.metrics.streaming_precision,
- prediction_key=tf.contrib.learn.PredictionKey.CLASSES),
- "recall":
- tf.contrib.learn.MetricSpec(
- metric_fn=tf.contrib.metrics.streaming_recall,
- prediction_key=tf.contrib.learn.PredictionKey.CLASSES)
-}
-```
-
-Add the above code before the `ValidationMonitor` constructor. Then revise the
-`ValidationMonitor` constructor as follows to add a `metrics` parameter to log
-the accuracy, precision, and recall metrics specified in `validation_metrics`
-(loss is always logged, and doesn't need to be explicitly specified):
-
-```python
-validation_monitor = tf.contrib.learn.monitors.ValidationMonitor(
- test_set.data,
- test_set.target,
- every_n_steps=50,
- metrics=validation_metrics)
-```
-
-Rerun the code, and you should see precision and recall included in your log
-output, e.g.:
-
-```none
-INFO:tensorflow:Validation (step 50): recall = 0.0, loss = 1.20626, global_step = 1, precision = 0.0, accuracy = 0.266667
-...
-INFO:tensorflow:Validation (step 600): recall = 1.0, loss = 0.0530696, global_step = 571, precision = 1.0, accuracy = 0.966667
-...
-INFO:tensorflow:Validation (step 1500): recall = 1.0, loss = 0.0617403, global_step = 1452, precision = 1.0, accuracy = 0.966667
-```
-
-### Early Stopping with ValidationMonitor
-
-Note that in the above log output, by step 600, the model has already achieved
-precision and recall rates of 1.0. This raises the question as to whether model
-training could benefit from
-[early stopping](https://en.wikipedia.org/wiki/Early_stopping).
-
-In addition to logging eval metrics, `ValidationMonitor`s make it easy to
-implement early stopping when specified conditions are met, via three params:
-
-| Param | Description |
-| -------------------------------- | ----------------------------------------- |
-| `early_stopping_metric` | Metric that triggers early stopping |
-: : (e.g., loss or accuracy) under conditions :
-: : specified in `early_stopping_rounds` and :
-: : `early_stopping_metric_minimize`. Default :
-: : is `"loss"`. :
-| `early_stopping_metric_minimize` | `True` if desired model behavior is to |
-: : minimize the value of :
-: : `early_stopping_metric`; `False` if :
-: : desired model behavior is to maximize the :
-: : value of `early_stopping_metric`. Default :
-: : is `True`. :
-| `early_stopping_rounds` | Sets a number of steps during which if |
-: : the `early_stopping_metric` does not :
-: : decrease (if :
-: : `early_stopping_metric_minimize` is :
-: : `True`) or increase (if :
-: : `early_stopping_metric_minimize` is :
-: : `False`), training will be stopped. :
-: : Default is `None`, which means early :
-: : stopping will never occur. :
-
-Make the following revision to the `ValidationMonitor` constructor, which
-specifies that if loss (`early_stopping_metric="loss"`) does not decrease
-(`early_stopping_metric_minimize=True`) over a period of 200 steps
-(`early_stopping_rounds=200`), model training will stop immediately at that
-point, and not complete the full 2000 steps specified in `fit`:
-
-```python
-validation_monitor = tf.contrib.learn.monitors.ValidationMonitor(
- test_set.data,
- test_set.target,
- every_n_steps=50,
- metrics=validation_metrics,
- early_stopping_metric="loss",
- early_stopping_metric_minimize=True,
- early_stopping_rounds=200)
-```
-
-Rerun the code to see if model training stops early:
-
-```none
-...
-INFO:tensorflow:Validation (step 1150): recall = 1.0, loss = 0.056436, global_step = 1119, precision = 1.0, accuracy = 0.966667
-INFO:tensorflow:Stopping. Best step: 800 with loss = 0.048313818872.
-```
-
-Indeed, here training stops at step 1150, indicating that for the past 200
-steps, loss did not decrease, and that overall, step 800 produced the smallest
-loss value against the test data set. This suggests that additional calibration
-of hyperparameters by decreasing the step count might further improve the model.
-
-## Visualizing Log Data with TensorBoard
-
-Reading through the log produced by `ValidationMonitor` provides plenty of raw
-data on model performance during training, but it may also be helpful to see
-visualizations of this data to get further insight into trends&mdash;for
-example, how accuracy is changing over step count. You can use TensorBoard (a
-separate program packaged with TensorFlow) to plot graphs like this by setting
-the `logdir` command-line argument to the directory where you saved your model
-training data (here, `/tmp/iris_model`). Run the following on your command line:
-
-<pre><strong>$ tensorboard --logdir=/tmp/iris_model/</strong>
-Starting TensorBoard 39 on port 6006</pre>
-
-Then navigate to `http://0.0.0.0:`*`<port_number>`* in your browser, where
-*`<port_number>`* is the port specified in the command-line output (here,
-`6006`).
-
-If you click on the accuracy field, you'll see an image like the following,
-which shows accuracy plotted against step count:
-
-![Accuracy over step count in TensorBoard](https://www.tensorflow.org/images/validation_monitor_tensorboard_accuracy.png "Accuracy over step count in TensorBoard")
-
-For more on using TensorBoard, see @{$summaries_and_tensorboard$TensorBoard: Visualizing Learning} and @{$graph_viz$TensorBoard: Graph Visualization}.
diff --git a/tensorflow/docs_src/install/install_c.md b/tensorflow/docs_src/install/install_c.md
index 3a153e8114..df622c6ac5 100644
--- a/tensorflow/docs_src/install/install_c.md
+++ b/tensorflow/docs_src/install/install_c.md
@@ -38,7 +38,7 @@ enable TensorFlow for C:
OS="linux" # Change to "darwin" for macOS
TARGET_DIRECTORY="/usr/local"
curl -L \
- "https://storage.googleapis.com/tensorflow/libtensorflow/libtensorflow-${TF_TYPE}-${OS}-x86_64-1.4.0-rc1.tar.gz" |
+ "https://storage.googleapis.com/tensorflow/libtensorflow/libtensorflow-${TF_TYPE}-${OS}-x86_64-1.4.0.tar.gz" |
sudo tar -C $TARGET_DIRECTORY -xz
The `tar` command extracts the TensorFlow C library into the `lib`
diff --git a/tensorflow/docs_src/install/install_go.md b/tensorflow/docs_src/install/install_go.md
index df43255896..8b3da49a0d 100644
--- a/tensorflow/docs_src/install/install_go.md
+++ b/tensorflow/docs_src/install/install_go.md
@@ -38,7 +38,7 @@ steps to install this library and enable TensorFlow for Go:
TF_TYPE="cpu" # Change to "gpu" for GPU support
TARGET_DIRECTORY='/usr/local'
curl -L \
- "https://storage.googleapis.com/tensorflow/libtensorflow/libtensorflow-${TF_TYPE}-$(go env GOOS)-x86_64-1.4.0-rc1.tar.gz" |
+ "https://storage.googleapis.com/tensorflow/libtensorflow/libtensorflow-${TF_TYPE}-$(go env GOOS)-x86_64-1.4.0.tar.gz" |
sudo tar -C $TARGET_DIRECTORY -xz
The `tar` command extracts the TensorFlow C library into the `lib`
diff --git a/tensorflow/docs_src/install/install_java.md b/tensorflow/docs_src/install/install_java.md
index f7f2c3cdc7..6eb8158249 100644
--- a/tensorflow/docs_src/install/install_java.md
+++ b/tensorflow/docs_src/install/install_java.md
@@ -36,7 +36,7 @@ following to the project's `pom.xml` to use the TensorFlow Java APIs:
<dependency>
<groupId>org.tensorflow</groupId>
<artifactId>tensorflow</artifactId>
- <version>1.4.0-rc1</version>
+ <version>1.4.0</version>
</dependency>
```
@@ -65,7 +65,7 @@ As an example, these steps will create a Maven project that uses TensorFlow:
<dependency>
<groupId>org.tensorflow</groupId>
<artifactId>tensorflow</artifactId>
- <version>1.4.0-rc1</version>
+ <version>1.4.0</version>
</dependency>
</dependencies>
</project>
@@ -124,7 +124,7 @@ refer to the simpler instructions above instead.
Take the following steps to install TensorFlow for Java on Linux or macOS:
1. Download
- [libtensorflow.jar](https://storage.googleapis.com/tensorflow/libtensorflow/libtensorflow-1.4.0-rc1.jar),
+ [libtensorflow.jar](https://storage.googleapis.com/tensorflow/libtensorflow/libtensorflow-1.4.0.jar),
which is the TensorFlow Java Archive (JAR).
2. Decide whether you will run TensorFlow for Java on CPU(s) only or with
@@ -143,7 +143,7 @@ Take the following steps to install TensorFlow for Java on Linux or macOS:
OS=$(uname -s | tr '[:upper:]' '[:lower:]')
mkdir -p ./jni
curl -L \
- "https://storage.googleapis.com/tensorflow/libtensorflow/libtensorflow_jni-${TF_TYPE}-${OS}-x86_64-1.4.0-rc1.tar.gz" |
+ "https://storage.googleapis.com/tensorflow/libtensorflow/libtensorflow_jni-${TF_TYPE}-${OS}-x86_64-1.4.0.tar.gz" |
tar -xz -C ./jni
### Install on Windows
@@ -151,10 +151,10 @@ Take the following steps to install TensorFlow for Java on Linux or macOS:
Take the following steps to install TensorFlow for Java on Windows:
1. Download
- [libtensorflow.jar](https://storage.googleapis.com/tensorflow/libtensorflow/libtensorflow-1.4.0-rc1.jar),
+ [libtensorflow.jar](https://storage.googleapis.com/tensorflow/libtensorflow/libtensorflow-1.4.0.jar),
which is the TensorFlow Java Archive (JAR).
2. Download the following Java Native Interface (JNI) file appropriate for
- [TensorFlow for Java on Windows](https://storage.googleapis.com/tensorflow/libtensorflow/libtensorflow_jni-cpu-windows-x86_64-1.4.0-rc1.zip).
+ [TensorFlow for Java on Windows](https://storage.googleapis.com/tensorflow/libtensorflow/libtensorflow_jni-cpu-windows-x86_64-1.4.0.zip).
3. Extract this .zip file.
@@ -202,7 +202,7 @@ must be part of your `classpath`. For example, you can include the
downloaded `.jar` in your `classpath` by using the `-cp` compilation flag
as follows:
-<pre><b>javac -cp libtensorflow-1.4.0-rc1.jar HelloTF.java</b></pre>
+<pre><b>javac -cp libtensorflow-1.4.0.jar HelloTF.java</b></pre>
### Running
@@ -216,11 +216,11 @@ two files are available to the JVM:
For example, the following command line executes the `HelloTF` program on Linux
and macOS:
-<pre><b>java -cp libtensorflow-1.4.0-rc1.jar:. -Djava.library.path=./jni HelloTF</b></pre>
+<pre><b>java -cp libtensorflow-1.4.0.jar:. -Djava.library.path=./jni HelloTF</b></pre>
And the following command line executes the `HelloTF` program on Windows:
-<pre><b>java -cp libtensorflow-1.4.0-rc1.jar;. -Djava.library.path=jni HelloTF</b></pre>
+<pre><b>java -cp libtensorflow-1.4.0.jar;. -Djava.library.path=jni HelloTF</b></pre>
If the program prints <tt>Hello from <i>version</i></tt>, you've successfully
installed TensorFlow for Java and are ready to use the API. If the program
diff --git a/tensorflow/docs_src/install/install_linux.md b/tensorflow/docs_src/install/install_linux.md
index 414ab7b1f7..f7380bac8a 100644
--- a/tensorflow/docs_src/install/install_linux.md
+++ b/tensorflow/docs_src/install/install_linux.md
@@ -188,7 +188,7 @@ Take the following steps to install TensorFlow with Virtualenv:
Virtualenv environment:
<pre>(tensorflow)$ <b>pip3 install --upgrade \
- https://storage.googleapis.com/tensorflow/linux/cpu/tensorflow-1.4.0rc1-cp34-cp34m-linux_x86_64.whl</b></pre>
+ https://storage.googleapis.com/tensorflow/linux/cpu/tensorflow-1.4.0-cp34-cp34m-linux_x86_64.whl</b></pre>
If you encounter installation problems, see
[Common Installation Problems](#common_installation_problems).
@@ -293,7 +293,7 @@ take the following steps:
<pre>
$ <b>sudo pip3 install --upgrade \
- https://storage.googleapis.com/tensorflow/linux/cpu/tensorflow-1.4.0rc1-cp34-cp34m-linux_x86_64.whl</b>
+ https://storage.googleapis.com/tensorflow/linux/cpu/tensorflow-1.4.0-cp34-cp34m-linux_x86_64.whl</b>
</pre>
If this step fails, see
@@ -480,7 +480,7 @@ Take the following steps to install TensorFlow in an Anaconda environment:
<pre>
(tensorflow)$ <b>pip install --ignore-installed --upgrade \
- https://storage.googleapis.com/tensorflow/linux/cpu/tensorflow-1.4.0rc1-cp34-cp34m-linux_x86_64.whl</b></pre>
+ https://storage.googleapis.com/tensorflow/linux/cpu/tensorflow-1.4.0-cp34-cp34m-linux_x86_64.whl</b></pre>
<a name="ValidateYourInstallation"></a>
@@ -648,14 +648,14 @@ This section documents the relevant values for Linux installations.
CPU only:
<pre>
-https://storage.googleapis.com/tensorflow/linux/cpu/tensorflow-1.4.0rc1-cp27-none-linux_x86_64.whl
+https://storage.googleapis.com/tensorflow/linux/cpu/tensorflow-1.4.0-cp27-none-linux_x86_64.whl
</pre>
GPU support:
<pre>
-https://storage.googleapis.com/tensorflow/linux/gpu/tensorflow_gpu-1.4.0rc1-cp27-none-linux_x86_64.whl
+https://storage.googleapis.com/tensorflow/linux/gpu/tensorflow_gpu-1.4.0-cp27-none-linux_x86_64.whl
</pre>
Note that GPU support requires the NVIDIA hardware and software described in
@@ -667,14 +667,14 @@ Note that GPU support requires the NVIDIA hardware and software described in
CPU only:
<pre>
-https://storage.googleapis.com/tensorflow/linux/cpu/tensorflow-1.4.0rc1-cp34-cp34m-linux_x86_64.whl
+https://storage.googleapis.com/tensorflow/linux/cpu/tensorflow-1.4.0-cp34-cp34m-linux_x86_64.whl
</pre>
GPU support:
<pre>
-https://storage.googleapis.com/tensorflow/linux/gpu/tensorflow_gpu-1.4.0rc1-cp34-cp34m-linux_x86_64.whl
+https://storage.googleapis.com/tensorflow/linux/gpu/tensorflow_gpu-1.4.0-cp34-cp34m-linux_x86_64.whl
</pre>
Note that GPU support requires the NVIDIA hardware and software described in
@@ -686,14 +686,14 @@ Note that GPU support requires the NVIDIA hardware and software described in
CPU only:
<pre>
-https://storage.googleapis.com/tensorflow/linux/cpu/tensorflow-1.4.0rc1-cp35-cp35m-linux_x86_64.whl
+https://storage.googleapis.com/tensorflow/linux/cpu/tensorflow-1.4.0-cp35-cp35m-linux_x86_64.whl
</pre>
GPU support:
<pre>
-https://storage.googleapis.com/tensorflow/linux/gpu/tensorflow_gpu-1.4.0rc1-cp35-cp35m-linux_x86_64.whl
+https://storage.googleapis.com/tensorflow/linux/gpu/tensorflow_gpu-1.4.0-cp35-cp35m-linux_x86_64.whl
</pre>
@@ -705,14 +705,14 @@ Note that GPU support requires the NVIDIA hardware and software described in
CPU only:
<pre>
-https://storage.googleapis.com/tensorflow/linux/cpu/tensorflow-1.4.0rc1-cp36-cp36m-linux_x86_64.whl
+https://storage.googleapis.com/tensorflow/linux/cpu/tensorflow-1.4.0-cp36-cp36m-linux_x86_64.whl
</pre>
GPU support:
<pre>
-https://storage.googleapis.com/tensorflow/linux/gpu/tensorflow_gpu-1.4.0rc1-cp36-cp36m-linux_x86_64.whl
+https://storage.googleapis.com/tensorflow/linux/gpu/tensorflow_gpu-1.4.0-cp36-cp36m-linux_x86_64.whl
</pre>
diff --git a/tensorflow/docs_src/install/install_mac.md b/tensorflow/docs_src/install/install_mac.md
index 9a95710bfa..79b383817b 100644
--- a/tensorflow/docs_src/install/install_mac.md
+++ b/tensorflow/docs_src/install/install_mac.md
@@ -114,7 +114,7 @@ Take the following steps to install TensorFlow with Virtualenv:
TensorFlow in the active Virtualenv is as follows:
<pre> $ <b>pip3 install --upgrade \
- https://storage.googleapis.com/tensorflow/mac/cpu/tensorflow-1.4.0rc1-py2-none-any.whl</b></pre>
+ https://storage.googleapis.com/tensorflow/mac/cpu/tensorflow-1.4.0-py2-none-any.whl</b></pre>
If you encounter installation problems, see
[Common Installation Problems](#common-installation-problems).
@@ -235,7 +235,7 @@ take the following steps:
issue the following command:
<pre> $ <b>sudo pip3 install --upgrade \
- https://storage.googleapis.com/tensorflow/mac/cpu/tensorflow-1.4.0rc1-py2-none-any.whl</b> </pre>
+ https://storage.googleapis.com/tensorflow/mac/cpu/tensorflow-1.4.0-py2-none-any.whl</b> </pre>
If the preceding command fails, see
[installation problems](#common-installation-problems).
@@ -344,7 +344,7 @@ Take the following steps to install TensorFlow in an Anaconda environment:
TensorFlow for Python 2.7:
<pre> (tensorflow)$ <b>pip install --ignore-installed --upgrade \
- https://storage.googleapis.com/tensorflow/mac/cpu/tensorflow-1.4.0rc1-py2-none-any.whl</b></pre>
+ https://storage.googleapis.com/tensorflow/mac/cpu/tensorflow-1.4.0-py2-none-any.whl</b></pre>
<a name="ValidateYourInstallation"></a>
@@ -517,7 +517,7 @@ This section documents the relevant values for Mac OS installations.
<pre>
-https://storage.googleapis.com/tensorflow/mac/cpu/tensorflow-1.4.0rc1-py2-none-any.whl
+https://storage.googleapis.com/tensorflow/mac/cpu/tensorflow-1.4.0-py2-none-any.whl
</pre>
@@ -525,7 +525,7 @@ https://storage.googleapis.com/tensorflow/mac/cpu/tensorflow-1.4.0rc1-py2-none-a
<pre>
-https://storage.googleapis.com/tensorflow/mac/cpu/tensorflow-1.4.0rc1-py3-none-any.whl
+https://storage.googleapis.com/tensorflow/mac/cpu/tensorflow-1.4.0-py3-none-any.whl
</pre>
diff --git a/tensorflow/docs_src/install/install_sources.md b/tensorflow/docs_src/install/install_sources.md
index 6d0dcdcd4a..aa4ae6c876 100644
--- a/tensorflow/docs_src/install/install_sources.md
+++ b/tensorflow/docs_src/install/install_sources.md
@@ -355,10 +355,10 @@ Invoke `pip install` to install that pip package.
The filename of the `.whl` file depends on your platform.
For example, the following command will install the pip package
-for TensorFlow 1.4.0rc1 on Linux:
+for TensorFlow 1.4.0 on Linux:
<pre>
-$ <b>sudo pip install /tmp/tensorflow_pkg/tensorflow-1.4.0rc1-py2-none-any.whl</b>
+$ <b>sudo pip install /tmp/tensorflow_pkg/tensorflow-1.4.0-py2-none-any.whl</b>
</pre>
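Once the wheel is installed, a quick smoke test confirms that the package imports and runs a trivial graph. This is a sketch of the usual validation snippet; the exact version string depends on the build:

```python
import tensorflow as tf

hello = tf.constant('Hello, TensorFlow!')
with tf.Session() as sess:
    print(sess.run(hello))   # b'Hello, TensorFlow!'
print(tf.__version__)        # expect 1.4.0 for the wheel built above
```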
## Validate your installation
@@ -447,8 +447,10 @@ Stack Overflow and specify the `tensorflow` tag.
**Linux**
<table>
<tr><th>Version:</th><th>CPU/GPU:</th><th>Python Version:</th><th>Compiler:</th><th>Build Tools:</th><th>cuDNN:</th><th>CUDA:</th></tr>
-<tr><td>tensorflow-1.4.0rc1</td><td>CPU</td><td>2.7, 3.3-3.6</td><td>GCC 4.8</td><td>Bazel 0.4.5</td><td>N/A</td><td>N/A</td></tr>
-<tr><td>tensorflow_gpu-1.4.0rc1</td><td>GPU</td><td>2.7, 3.3-3.6</td><td>GCC 4.8</td><td>Bazel 0.4.5</td><td>6</td><td>8</td></tr>
+<tr><td>tensorflow-1.4.0</td><td>CPU</td><td>2.7, 3.3-3.6</td><td>GCC 4.8</td><td>Bazel 0.5.4</td><td>N/A</td><td>N/A</td></tr>
+<tr><td>tensorflow_gpu-1.4.0</td><td>GPU</td><td>2.7, 3.3-3.6</td><td>GCC 4.8</td><td>Bazel 0.5.4</td><td>6</td><td>8</td></tr>
+<tr><td>tensorflow-1.3.0</td><td>CPU</td><td>2.7, 3.3-3.6</td><td>GCC 4.8</td><td>Bazel 0.4.5</td><td>N/A</td><td>N/A</td></tr>
+<tr><td>tensorflow_gpu-1.3.0</td><td>GPU</td><td>2.7, 3.3-3.6</td><td>GCC 4.8</td><td>Bazel 0.4.5</td><td>6</td><td>8</td></tr>
<tr><td>tensorflow-1.2.0</td><td>CPU</td><td>2.7, 3.3-3.6</td><td>GCC 4.8</td><td>Bazel 0.4.5</td><td>N/A</td><td>N/A</td></tr>
<tr><td>tensorflow_gpu-1.2.0</td><td>GPU</td><td>2.7, 3.3-3.6</td><td>GCC 4.8</td><td>Bazel 0.4.5</td><td>5.1</td><td>8</td></tr>
<tr><td>tensorflow-1.1.0</td><td>CPU</td><td>2.7, 3.3-3.6</td><td>GCC 4.8</td><td>Bazel 0.4.2</td><td>N/A</td><td>N/A</td></tr>
@@ -460,7 +462,8 @@ Stack Overflow and specify the `tensorflow` tag.
**Mac**
<table>
<tr><th>Version:</th><th>CPU/GPU:</th><th>Python Version:</th><th>Compiler:</th><th>Build Tools:</th><th>cuDNN:</th><th>CUDA:</th></tr>
-<tr><td>tensorflow-1.4.0rc1</td><td>CPU</td><td>2.7, 3.3-3.6</td><td>Clang from xcode</td><td>Bazel 0.4.5</td><td>N/A</td><td>N/A</td></tr>
+<tr><td>tensorflow-1.4.0</td><td>CPU</td><td>2.7, 3.3-3.6</td><td>Clang from xcode</td><td>Bazel 0.5.4</td><td>N/A</td><td>N/A</td></tr>
+<tr><td>tensorflow-1.3.0</td><td>CPU</td><td>2.7, 3.3-3.6</td><td>Clang from xcode</td><td>Bazel 0.4.5</td><td>N/A</td><td>N/A</td></tr>
<tr><td>tensorflow-1.2.0</td><td>CPU</td><td>2.7, 3.3-3.6</td><td>Clang from xcode</td><td>Bazel 0.4.5</td><td>N/A</td><td>N/A</td></tr>
<tr><td>tensorflow-1.1.0</td><td>CPU</td><td>2.7, 3.3-3.6</td><td>Clang from xcode</td><td>Bazel 0.4.2</td><td>N/A</td><td>N/A</td></tr>
<tr><td>tensorflow_gpu-1.1.0</td><td>GPU</td><td>2.7, 3.3-3.6</td><td>Clang from xcode</td><td>Bazel 0.4.2</td><td>5.1</td><td>8</td></tr>
@@ -471,8 +474,10 @@ Stack Overflow and specify the `tensorflow` tag.
**Windows**
<table>
<tr><th>Version:</th><th>CPU/GPU:</th><th>Python Version:</th><th>Compiler:</th><th>Build Tools:</th><th>cuDNN:</th><th>CUDA:</th></tr>
-<tr><td>tensorflow-1.4.0rc1</td><td>CPU</td><td>3.5-3.6</td><td>MSVC 2015 update 3</td><td>Cmake v3.6.3</td><td>N/A</td><td>N/A</td></tr>
-<tr><td>tensorflow_gpu-1.4.0rc1</td><td>GPU</td><td>3.5-3.6</td><td>MSVC 2015 update 3</td><td>Cmake v3.6.3</td><td>6</td><td>8</td></tr>
+<tr><td>tensorflow-1.4.0</td><td>CPU</td><td>3.5-3.6</td><td>MSVC 2015 update 3</td><td>Cmake v3.6.3</td><td>N/A</td><td>N/A</td></tr>
+<tr><td>tensorflow_gpu-1.4.0</td><td>GPU</td><td>3.5-3.6</td><td>MSVC 2015 update 3</td><td>Cmake v3.6.3</td><td>6</td><td>8</td></tr>
+<tr><td>tensorflow-1.3.0</td><td>CPU</td><td>3.5-3.6</td><td>MSVC 2015 update 3</td><td>Cmake v3.6.3</td><td>N/A</td><td>N/A</td></tr>
+<tr><td>tensorflow_gpu-1.3.0</td><td>GPU</td><td>3.5-3.6</td><td>MSVC 2015 update 3</td><td>Cmake v3.6.3</td><td>6</td><td>8</td></tr>
<tr><td>tensorflow-1.2.0</td><td>CPU</td><td>3.5-3.6</td><td>MSVC 2015 update 3</td><td>Cmake v3.6.3</td><td>N/A</td><td>N/A</td></tr>
<tr><td>tensorflow_gpu-1.2.0</td><td>GPU</td><td>3.5-3.6</td><td>MSVC 2015 update 3</td><td>Cmake v3.6.3</td><td>5.1</td><td>8</td></tr>
<tr><td>tensorflow-1.1.0</td><td>CPU</td><td>3.5</td><td>MSVC 2015 update 3</td><td>Cmake v3.6.3</td><td>N/A</td><td>N/A</td></tr>
diff --git a/tensorflow/docs_src/mobile/index.md b/tensorflow/docs_src/mobile/index.md
index 06ad47bc62..a6f1422f6f 100644
--- a/tensorflow/docs_src/mobile/index.md
+++ b/tensorflow/docs_src/mobile/index.md
@@ -35,8 +35,8 @@ speech-driven interface, and many of these require on-device processing. Most of
the time a user isn’t giving commands, and so streaming audio continuously to a
remote server would be a waste of bandwidth, since it would mostly be silence or
background noises. To solve this problem it’s common to have a small neural
-network running on-device @{$tutorials/audio_recognition$listening out for a particular keyword}.
-Once that keyword has been spotted, the rest of the
+network running on-device @{$tutorials/audio_recognition$listening out for a
+particular keyword}. Once that keyword has been spotted, the rest of the
conversation can be transmitted over to the server for further processing if
more computing power is needed.
diff --git a/tensorflow/docs_src/mobile/prepare_models.md b/tensorflow/docs_src/mobile/prepare_models.md
index c5a560e074..8fc65be35a 100644
--- a/tensorflow/docs_src/mobile/prepare_models.md
+++ b/tensorflow/docs_src/mobile/prepare_models.md
@@ -296,6 +296,6 @@ complains about missing header files, add the .h’s that are needed into
the
[`android_extended_ops`](https://www.tensorflow.org/code/tensorflow/core/kernels/BUILD#L3525) target.
-If you’re using a makefile targetting iOS, Raspberry Pi, etc, go to
+If you’re using a makefile targeting iOS, Raspberry Pi, etc., go to
[`tensorflow/contrib/makefile/tf_op_files.txt`](https://www.tensorflow.org/code/tensorflow/contrib/makefile/tf_op_files.txt) and
add the right implementation files there.
diff --git a/tensorflow/docs_src/performance/xla/operation_semantics.md b/tensorflow/docs_src/performance/xla/operation_semantics.md
index ccced8792e..3ca3b51a5e 100644
--- a/tensorflow/docs_src/performance/xla/operation_semantics.md
+++ b/tensorflow/docs_src/performance/xla/operation_semantics.md
@@ -901,95 +901,6 @@ are all 0. Figure below shows examples of different `edge_padding` and
<img style="width:100%" src="https://www.tensorflow.org/images/ops_pad.png">
</div>
-## Recv
-
-See also
-[`ComputationBuilder::Recv`](https://www.tensorflow.org/code/tensorflow/compiler/xla/client/computation_builder.h).
-
-<b> `Recv(shape, channel_handle)` </b>
-
-| Arguments | Type | Semantics |
-| ---------------- | --------------- | ------------------------------------ |
-| `shape` | `Shape` | shape of the data to receive |
-| `channel_handle` | `ChannelHandle` | unique identifier for each send/recv pair |
-
-Receives data of the given shape from a `Send` instruction in another
-computation that shares the same channel handle. Returns a
-ComputationDataHandle for the received data.
-
-The client API of `Recv` operation represents synchronous communication.
-However, the instruction is internally decomposed into 2 HLO instructions
-(`Recv` and `RecvDone`) to enable asynchronous data transfers. See also
-[`HloInstruction::CreateRecv` and `HloInstruction::CreateRecvDone`](https://www.tensorflow.org/code/tensorflow/compiler/xla/service/hlo_instruction.h).
-
-<b>`Recv(const Shape& shape, int64 channel_id)`</b>
-
-Allocates resources required to receive data from a `Send` instruction with the
-same channel_id. Returns a context for the allocated resources, which is used
-by a following `RecvDone` instruction to wait for the completion of the data
-transfer. The context is a tuple of {receive buffer (shape), request identifier
-(U32)} and it can only be used by a `RecvDone` instruction.
-
-<b> `RecvDone(HloInstruction context)` </b>
-
-Given a context created by a `Recv` instruction, waits for the data transfer to
-complete and returns the received data.
-
-## Send
-
-See also
-[`ComputationBuilder::Send`](https://www.tensorflow.org/code/tensorflow/compiler/xla/client/computation_builder.h).
-
-<b> `Send(operand, channel_handle)` </b>
-
-| Arguments | Type | Semantics |
-| ---------------- | ----------------------- | -------------------------------- |
-| `operand` | `ComputationDataHandle` | data to send (array of type T) |
-| `channel_handle` | `ChannelHandle` | unique identifier for each send/recv pair |
-
-Sends the given operand data to a `Recv` instruction in another computation
-that shares the same channel handle. Does not return any data.
-
-Similar to the `Recv` operation, the client API of `Send` operation represents
-synchronous communication, and is internally decomposed into 2 HLO instructions
-(`Send` and `SendDone`) to enable asynchronous data transfers. See also
-[`HloInstruction::CreateSend` and `HloInstruction::CreateSendDone`](https://www.tensorflow.org/code/tensorflow/compiler/xla/service/hlo_instruction.h).
-
-<b>`Send(HloInstruction operand, int64 channel_id)`</b>
-
-Initiates an asynchronous transfer of the operand to the resources allocated by
-the `Recv` instruction with the same channel id. Returns a context, which is
-used by a following `SendDone` instruction to wait for the completion of the
-data transfer. The context is a tuple of {operand (shape), request identifier
-(U32)} and it can only be used by a `SendDone` instruction.
-
-<b> `SendDone(HloInstruction context)` </b>
-
-Given a context created by a `Send` instruction, waits for the data transfer to
-complete. The instruction does not return any data.
-
-<b> Scheduling of channel instructions </b>
-
-The execution order of the 4 instructions for each channel (`Recv`, `RecvDone`,
-`Send`, `SendDone`) is as below.
-
-<div style="width:95%; margin:auto; margin-bottom:10px; margin-top:20px;">
- <img style="width:70%" src="../../images/send_recv_order.png">
-</div>
-
-* `Recv` happens before `Send`
-* `Send` happens before `RecvDone`
-* `Recv` happens before `RecvDone`
-* `Send` happens before `SendDone`
-
-When the backend compilers generate a linear schedule for each computation that
-communicates via channel instructions, there must not be cycles across the
-computations. For example, below schedules lead to deadlocks.
-
-<div style="width:95%; margin:auto; margin-bottom:10px; margin-top:20px;">
- <img style="width:100%" src="../../images/send_recv_schedule.png">
-</div>
-
## Reduce
See also
diff --git a/tensorflow/docs_src/programmers_guide/debugger.md b/tensorflow/docs_src/programmers_guide/debugger.md
index 87d900eae3..dd5496b08e 100644
--- a/tensorflow/docs_src/programmers_guide/debugger.md
+++ b/tensorflow/docs_src/programmers_guide/debugger.md
@@ -157,6 +157,7 @@ Try the following commands at the `tfdbg>` prompt (referencing the code at
| | `pt <tensor>[slicing]` | Print a subarray of tensor, using [numpy](http://www.numpy.org/)-style array slicing. | `pt hidden/Relu:0[0:50,:]` |
| | `-a` | Print the entirety of a large tensor, without using ellipses. (May take a long time for large tensors.) | `pt -a hidden/Relu:0[0:50,:]` |
| | `-r <range>` | Highlight elements falling into specified numerical range. Multiple ranges can be used in conjunction. | `pt hidden/Relu:0 -a -r [[-inf,-1],[1,inf]]` |
+| | `-n <number>` | Print dump corresponding to specified 0-based dump number. Required for tensors with multiple dumps. | `pt -n 0 hidden/Relu:0` |
| | `-s` | Include a summary of the numeric values of the tensor (applicable only to non-empty tensors with Boolean and numeric types such as `int*` and `float*`.) | `pt -s hidden/Relu:0[0:50,:]` |
| **`@[coordinates]`** | | Navigate to specified element in `pt` output. | `@[10,0]` or `@10,0` |
| **`/regex`** | | [less](https://linux.die.net/man/1/less)-style search for given regular expression. | `/inf` |
@@ -174,10 +175,12 @@ Try the following commands at the `tfdbg>` prompt (referencing the code at
| | `-r` | List the inputs to node, recursively (the input tree.) | `li -r hidden/Relu:0` |
| | `-d <max_depth>` | Limit recursion depth under the `-r` mode. | `li -r -d 3 hidden/Relu:0` |
| | `-c` | Include control inputs. | `li -c -r hidden/Relu:0` |
+| | `-t` | Show op types of input nodes. | `li -t -r hidden/Relu:0` |
| **`lo`** | | **List output recipients of node** | |
| | `-r` | List the output recipients of node, recursively (the output tree.) | `lo -r hidden/Relu:0` |
| | `-d <max_depth>` | Limit recursion depth under the `-r` mode. | `lo -r -d 3 hidden/Relu:0` |
| | `-c` | Include recipients via control edges. | `lo -c -r hidden/Relu:0` |
+| | `-t` | Show op types of recipient nodes. | `lo -t -r hidden/Relu:0` |
| **`ls`** | | **List Python source files involved in node creation.** | |
| | `-p <path_pattern>` | Limit output to source files matching given regular-expression path pattern. | `ls -p .*debug_mnist.*` |
| | `-n` | Limit output to node names matching given regular-expression pattern. | `ls -n Softmax.*` |
@@ -517,12 +520,8 @@ model.fit(...) # This will break into the TFDBG CLI.
## Debugging tf-slim with TFDBG
-TFDBG supports debugging of training and evaluation with
+TFDBG currently supports only training with
[tf-slim](https://github.com/tensorflow/tensorflow/tree/master/tensorflow/contrib/slim).
-As detailed below, training and evaluation require slightly different debugging
-workflows.
-
-### Debugging training in tf-slim
To debug the training process, provide `LocalCLIDebugWrapperSession` to the
`session_wrapper` argument of `slim.learning.train()`. For example:
@@ -531,31 +530,13 @@ import tensorflow as tf
from tensorflow.python import debug as tf_debug
# ... Code that creates the graph and the train_op ...
-tf.contrib.slim.learning.train(
+tf.contrib.slim.learning.train(
train_op,
logdir,
number_of_steps=10,
session_wrapper=tf_debug.LocalCLIDebugWrapperSession)
```
-### Debugging evaluation in tf-slim
-To debug the evaluation process, provide `LocalCLIDebugHook` to the
-`hooks` argument of `slim.evaluation.evaluate_once()`. For example:
-
-``` python
-import tensorflow as tf
-from tensorflow.python import debug as tf_debug
-
-# ... Code that creates the graph and the eval and final ops ...
-tf.contrib.slim.evaluation.evaluate_once(
- '',
- checkpoint_path,
- logdir,
- eval_op=my_eval_op,
- final_op=my_value_op,
- hooks=[tf_debug.LocalCLIDebugHook()])
-```
-
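The same wrapper also works on a plain `Session` outside tf-slim. A minimal sketch, assuming the TF 1.x debug API; the graph below is illustrative:

```python
import tensorflow as tf
from tensorflow.python import debug as tf_debug

# Any graph works; this one is illustrative.
x = tf.Variable(10.0, name='x')
y = tf.add(x, 1.0, name='y')

sess = tf.Session()
sess.run(tf.global_variables_initializer())

# The wrapper drops you into the tfdbg CLI on each run() call, where the
# commands tabulated above (`pt`, `li -r`, `lo -t -r`, ...) apply.
sess = tf_debug.LocalCLIDebugWrapperSession(sess)
sess.run(y)
```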
## Offline Debugging of Remotely-Running Sessions
Often, your model is running on a remote machine or a process that you don't
diff --git a/tensorflow/docs_src/programmers_guide/tensors.md b/tensorflow/docs_src/programmers_guide/tensors.md
index d6f80430cd..88eb277e35 100644
--- a/tensorflow/docs_src/programmers_guide/tensors.md
+++ b/tensorflow/docs_src/programmers_guide/tensors.md
@@ -29,8 +29,8 @@ Some types of tensors are special, and these will be covered in other
units of the Programmer's guide. The main ones are:
* `tf.Variable`
- * `tf.Constant`
- * `tf.Placeholder`
+ * `tf.constant`
+ * `tf.placeholder`
* `tf.SparseTensor`
With the exception of `tf.Variable`, the value of a tensor is immutable, which
@@ -64,7 +64,7 @@ The following snippet demonstrates creating a few rank 0 variables:
mammal = tf.Variable("Elephant", tf.string)
ignition = tf.Variable(451, tf.int16)
floating = tf.Variable(3.14159265359, tf.float64)
-its_complicated = tf.Variable((12.3, -4.85), tf.complex64)
+its_complicated = tf.Variable(12.3 - 4.85j, tf.complex64)
```
Note: A string is treated as a single item in TensorFlow, not as a sequence of
@@ -79,7 +79,7 @@ initial value. For example:
mystr = tf.Variable(["Hello"], tf.string)
cool_numbers = tf.Variable([3.14159, 2.71828], tf.float32)
first_primes = tf.Variable([2, 3, 5, 7, 11], tf.int32)
-its_very_complicated = tf.Variable([(12.3, -4.85), (7.5, -6.23)], tf.complex64)
+its_very_complicated = tf.Variable([12.3 - 4.85j, 7.5 - 6.23j], tf.complex64)
```
@@ -275,8 +275,8 @@ Graphs and Sessions for more information).
Sometimes it is not possible to evaluate a `tf.Tensor` with no context because
its value might depend on dynamic information that is not available. For
-example, tensors that depend on `Placeholder`s can't be evaluated without
-providing a value for the `Placeholder`.
+example, tensors that depend on `placeholder`s can't be evaluated without
+providing a value for the `placeholder`.
``` python
p = tf.placeholder(tf.float32)
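# A hedged continuation (not from the original excerpt): `t` and the feed
# value below are hypothetical, but show how a placeholder receives data.
t = p + 1.0
with tf.Session() as sess:
    # Running t without feeding p would raise an error; feed_dict supplies it.
    print(sess.run(t, feed_dict={p: 2.0}))  # prints 3.0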
diff --git a/tensorflow/docs_src/tutorials/deep_cnn.md b/tensorflow/docs_src/tutorials/deep_cnn.md
index b57ef24f58..6f802fd106 100644
--- a/tensorflow/docs_src/tutorials/deep_cnn.md
+++ b/tensorflow/docs_src/tutorials/deep_cnn.md
@@ -83,21 +83,21 @@ for details. It consists of 1,068,298 learnable parameters and requires about
## Code Organization
The code for this tutorial resides in
-[`models/tutorials/image/cifar10/`](https://github.com/tensorflow/models/tree/master/tutorials/image/cifar10/).
+[`models/tutorials/image/cifar10/`](https://www.tensorflow.org/code/tensorflow_models/tutorials/image/cifar10/).
File | Purpose
--- | ---
-[`cifar10_input.py`](https://github.com/tensorflow/models/tree/master/tutorials/image/cifar10/cifar10_input.py) | Reads the native CIFAR-10 binary file format.
-[`cifar10.py`](https://github.com/tensorflow/models/tree/master/tutorials/image/cifar10/cifar10.py) | Builds the CIFAR-10 model.
-[`cifar10_train.py`](https://github.com/tensorflow/models/tree/master/tutorials/image/cifar10/cifar10_train.py) | Trains a CIFAR-10 model on a CPU or GPU.
-[`cifar10_multi_gpu_train.py`](https://github.com/tensorflow/models/tree/master/tutorials/image/cifar10/cifar10_multi_gpu_train.py) | Trains a CIFAR-10 model on multiple GPUs.
-[`cifar10_eval.py`](https://github.com/tensorflow/models/tree/master/tutorials/image/cifar10/cifar10_eval.py) | Evaluates the predictive performance of a CIFAR-10 model.
+[`cifar10_input.py`](https://www.tensorflow.org/code/tensorflow_models/tutorials/image/cifar10/cifar10_input.py) | Reads the native CIFAR-10 binary file format.
+[`cifar10.py`](https://www.tensorflow.org/code/tensorflow_models/tutorials/image/cifar10/cifar10.py) | Builds the CIFAR-10 model.
+[`cifar10_train.py`](https://www.tensorflow.org/code/tensorflow_models/tutorials/image/cifar10/cifar10_train.py) | Trains a CIFAR-10 model on a CPU or GPU.
+[`cifar10_multi_gpu_train.py`](https://www.tensorflow.org/code/tensorflow_models/tutorials/image/cifar10/cifar10_multi_gpu_train.py) | Trains a CIFAR-10 model on multiple GPUs.
+[`cifar10_eval.py`](https://www.tensorflow.org/code/tensorflow_models/tutorials/image/cifar10/cifar10_eval.py) | Evaluates the predictive performance of a CIFAR-10 model.
## CIFAR-10 Model
The CIFAR-10 network is largely contained in
-[`cifar10.py`](https://github.com/tensorflow/models/tree/master/tutorials/image/cifar10/cifar10.py).
+[`cifar10.py`](https://www.tensorflow.org/code/tensorflow_models/tutorials/image/cifar10/cifar10.py).
The complete training
graph contains roughly 765 operations. We find that we can make the code most
reusable by constructing the graph with the following modules:
diff --git a/tensorflow/docs_src/tutorials/word2vec.md b/tensorflow/docs_src/tutorials/word2vec.md
index 0a1c41c84a..3fe7352bd2 100644
--- a/tensorflow/docs_src/tutorials/word2vec.md
+++ b/tensorflow/docs_src/tutorials/word2vec.md
@@ -23,7 +23,7 @@ straight in, feel free to look at the minimalistic implementation in
This basic example contains the code needed to download some data, train on it a
bit and visualize the result. Once you get comfortable with reading and running
the basic version, you can graduate to
-[models/tutorials/embedding/word2vec.py](https://github.com/tensorflow/models/tree/master/tutorials/embedding/word2vec.py)
+[models/tutorials/embedding/word2vec.py](https://www.tensorflow.org/code/tensorflow_models/tutorials/embedding/word2vec.py)
which is a more serious implementation that showcases some more advanced
TensorFlow principles about how to efficiently use threads to move data into a
text model, how to checkpoint during training, etc.
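For a flavor of what those implementations build, here is a minimal sketch of the NCE-loss objective at the heart of the Skip-Gram model; shapes and hyperparameters are illustrative, assuming the TF 1.x `tf.nn.nce_loss` signature:

```python
import tensorflow as tf

vocabulary_size, embedding_size, num_sampled = 50000, 128, 64

# Embedding lookup for a batch of center words (illustrative shapes).
train_inputs = tf.placeholder(tf.int32, shape=[None])
train_labels = tf.placeholder(tf.int32, shape=[None, 1])
embeddings = tf.Variable(
    tf.random_uniform([vocabulary_size, embedding_size], -1.0, 1.0))
embed = tf.nn.embedding_lookup(embeddings, train_inputs)

# NCE parameters and the sampled, softmax-style loss.
nce_weights = tf.Variable(
    tf.truncated_normal([vocabulary_size, embedding_size], stddev=0.1))
nce_biases = tf.Variable(tf.zeros([vocabulary_size]))
loss = tf.reduce_mean(
    tf.nn.nce_loss(weights=nce_weights, biases=nce_biases,
                   labels=train_labels, inputs=embed,
                   num_sampled=num_sampled, num_classes=vocabulary_size))
optimizer = tf.train.GradientDescentOptimizer(1.0).minimize(loss)
```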
@@ -341,7 +341,7 @@ t-SNE.
Et voilà! As expected, words that are similar end up clustering near each
other. For a more heavyweight implementation of word2vec that showcases more of
the advanced features of TensorFlow, see the implementation in
-[models/tutorials/embedding/word2vec.py](https://github.com/tensorflow/models/tree/master/tutorials/embedding/word2vec.py).
+[models/tutorials/embedding/word2vec.py](https://www.tensorflow.org/code/tensorflow_models/tutorials/embedding/word2vec.py).
## Evaluating Embeddings: Analogical Reasoning
@@ -357,7 +357,7 @@ Download the dataset for this task from
To see how we do this evaluation, have a look at the `build_eval_graph()` and
`eval()` functions in
-[models/tutorials/embedding/word2vec.py](https://github.com/tensorflow/models/tree/master/tutorials/embedding/word2vec.py).
+[models/tutorials/embedding/word2vec.py](https://www.tensorflow.org/code/tensorflow_models/tutorials/embedding/word2vec.py).
The choice of hyperparameters can strongly influence the accuracy on this task.
To achieve state-of-the-art performance on this task requires training over a
@@ -385,13 +385,13 @@ your model is seriously bottlenecked on input data, you may want to implement a
custom data reader for your problem, as described in
@{$new_data_formats$New Data Formats}. For the case of Skip-Gram
modeling, we've actually already done this for you as an example in
-[models/tutorials/embedding/word2vec.py](https://github.com/tensorflow/models/tree/master/tutorials/embedding/word2vec.py).
+[models/tutorials/embedding/word2vec.py](https://www.tensorflow.org/code/tensorflow_models/tutorials/embedding/word2vec.py).
If your model is no longer I/O bound but you want still more performance, you
can take things further by writing your own TensorFlow Ops, as described in
@{$adding_an_op$Adding a New Op}. Again we've provided an
example of this for the Skip-Gram case
-[models/tutorials/embedding/word2vec_optimized.py](https://github.com/tensorflow/models/tree/master/tutorials/embedding/word2vec_optimized.py).
+[models/tutorials/embedding/word2vec_optimized.py](https://www.tensorflow.org/code/tensorflow_models/tutorials/embedding/word2vec_optimized.py).
Feel free to benchmark these against each other to measure performance
improvements at each stage.
diff --git a/tensorflow/examples/image_retraining/retrain.py b/tensorflow/examples/image_retraining/retrain.py
index ebddfb20f4..3549891461 100644
--- a/tensorflow/examples/image_retraining/retrain.py
+++ b/tensorflow/examples/image_retraining/retrain.py
@@ -69,18 +69,11 @@ to validate that you have gathered good training data, but if you want to deploy
on resource-limited platforms, you can try the `--architecture` flag with a
Mobilenet model. For example:
-Run floating-point version of mobilenet:
```bash
python tensorflow/examples/image_retraining/retrain.py \
--image_dir ~/flower_photos --architecture mobilenet_1.0_224
```
-Run quantized version of mobilenet:
-```bash
-python tensorflow/examples/image_retraining/retrain.py \
- --image_dir ~/flower_photos/ --architecture mobilenet_1.0_224_quantized
-```
-
There are 32 different Mobilenet models to choose from, with a variety of file
size and latency options. The first number can be '1.0', '0.75', '0.50', or
'0.25' to control the size, and the second controls the input image size, either
@@ -114,7 +107,6 @@ import numpy as np
from six.moves import urllib
import tensorflow as tf
-from tensorflow.contrib.quantize.python import quant_ops
from tensorflow.python.framework import graph_util
from tensorflow.python.framework import tensor_shape
from tensorflow.python.platform import gfile
@@ -279,7 +271,6 @@ def create_model_graph(model_info):
"""
with tf.Graph().as_default() as graph:
model_path = os.path.join(FLAGS.model_dir, model_info['model_file_name'])
- print('Model path: ', model_path)
with gfile.FastGFile(model_path, 'rb') as f:
graph_def = tf.GraphDef()
graph_def.ParseFromString(f.read())
@@ -346,10 +337,7 @@ def maybe_download_and_extract(data_url):
statinfo = os.stat(filepath)
tf.logging.info('Successfully downloaded', filename, statinfo.st_size,
'bytes.')
- print('Extracting file from ', filepath)
- tarfile.open(filepath, 'r:gz').extractall(dest_directory)
- else:
- print('Not extracting or downloading files, model already present in disk')
+ tarfile.open(filepath, 'r:gz').extractall(dest_directory)
def ensure_dir_exists(dir_name):
@@ -745,7 +733,7 @@ def variable_summaries(var):
def add_final_training_ops(class_count, final_tensor_name, bottleneck_tensor,
- bottleneck_tensor_size, quantize_layer):
+ bottleneck_tensor_size):
"""Adds a new softmax and fully-connected layer for training.
We need to retrain the top layer to identify our new classes, so this function
@@ -757,12 +745,10 @@ def add_final_training_ops(class_count, final_tensor_name, bottleneck_tensor,
Args:
class_count: Integer of how many categories of things we're trying to
- recognize.
+ recognize.
final_tensor_name: Name string for the new final node that produces results.
bottleneck_tensor: The output of the main CNN graph.
bottleneck_tensor_size: How many entries in the bottleneck vector.
- quantize_layer: Boolean, specifying whether the newly added layer should be
- quantized.
Returns:
The tensors for the training and cross entropy results, and tensors for the
@@ -785,41 +771,18 @@ def add_final_training_ops(class_count, final_tensor_name, bottleneck_tensor,
with tf.name_scope('weights'):
initial_value = tf.truncated_normal(
[bottleneck_tensor_size, class_count], stddev=0.001)
+
layer_weights = tf.Variable(initial_value, name='final_weights')
- if quantize_layer:
- quantized_layer_weights = quant_ops.MovingAvgQuantize(
- layer_weights, is_training=True)
- variable_summaries(quantized_layer_weights)
variable_summaries(layer_weights)
with tf.name_scope('biases'):
layer_biases = tf.Variable(tf.zeros([class_count]), name='final_biases')
- if quantize_layer:
- quantized_layer_biases = quant_ops.MovingAvgQuantize(
- layer_biases, is_training=True)
- variable_summaries(quantized_layer_biases)
-
variable_summaries(layer_biases)
-
with tf.name_scope('Wx_plus_b'):
- if quantize_layer:
- logits = tf.matmul(bottleneck_input,
- quantized_layer_weights) + quantized_layer_biases
- logits = quant_ops.MovingAvgQuantize(
- logits,
- init_min=-32.0,
- init_max=32.0,
- is_training=True,
- num_bits=8,
- narrow_range=False,
- ema_decay=0.5)
- tf.summary.histogram('pre_activations', logits)
- else:
- logits = tf.matmul(bottleneck_input, layer_weights) + layer_biases
- tf.summary.histogram('pre_activations', logits)
+ logits = tf.matmul(bottleneck_input, layer_weights) + layer_biases
+ tf.summary.histogram('pre_activations', logits)
final_tensor = tf.nn.softmax(logits, name=final_tensor_name)
-
tf.summary.histogram('activations', final_tensor)
with tf.name_scope('cross_entropy'):
@@ -827,7 +790,6 @@ def add_final_training_ops(class_count, final_tensor_name, bottleneck_tensor,
labels=ground_truth_input, logits=logits)
with tf.name_scope('total'):
cross_entropy_mean = tf.reduce_mean(cross_entropy)
-
tf.summary.scalar('cross_entropy', cross_entropy_mean)
with tf.name_scope('train'):
@@ -863,7 +825,6 @@ def add_evaluation_step(result_tensor, ground_truth_tensor):
def save_graph_to_file(sess, graph, graph_file_name):
output_graph_def = graph_util.convert_variables_to_constants(
sess, graph.as_graph_def(), [FLAGS.final_tensor_name])
-
with gfile.FastGFile(graph_file_name, 'wb') as f:
f.write(output_graph_def.SerializeToString())
return
@@ -897,7 +858,6 @@ def create_model_info(architecture):
ValueError: If architecture name is unknown.
"""
architecture = architecture.lower()
- is_quantized = False
if architecture == 'inception_v3':
# pylint: disable=line-too-long
data_url = 'http://download.tensorflow.org/models/image/imagenet/inception-2015-12-05.tgz'
@@ -942,28 +902,19 @@ def create_model_info(architecture):
architecture)
return None
is_quantized = True
-
- if is_quantized:
- data_url = 'http://download.tensorflow.org/models/mobilenet_v1_'
- data_url += version_string + '_' + size_string + '_quantized_frozen.tgz'
- bottleneck_tensor_name = 'MobilenetV1/Predictions/Reshape:0'
- resized_input_tensor_name = 'Placeholder:0'
- model_dir_name = ('mobilenet_v1_' + version_string + '_' + size_string +
- '_quantized_frozen')
- model_base_name = 'quantized_frozen_graph.pb'
-
- else:
- data_url = 'http://download.tensorflow.org/models/mobilenet_v1_'
- data_url += version_string + '_' + size_string + '_frozen.tgz'
- bottleneck_tensor_name = 'MobilenetV1/Predictions/Reshape:0'
- resized_input_tensor_name = 'input:0'
- model_dir_name = 'mobilenet_v1_' + version_string + '_' + size_string
- model_base_name = 'frozen_graph.pb'
-
+ data_url = 'http://download.tensorflow.org/models/mobilenet_v1_'
+ data_url += version_string + '_' + size_string + '_frozen.tgz'
+ bottleneck_tensor_name = 'MobilenetV1/Predictions/Reshape:0'
bottleneck_tensor_size = 1001
input_width = int(size_string)
input_height = int(size_string)
input_depth = 3
+ resized_input_tensor_name = 'input:0'
+ if is_quantized:
+ model_base_name = 'quantized_graph.pb'
+ else:
+ model_base_name = 'frozen_graph.pb'
+ model_dir_name = 'mobilenet_v1_' + version_string + '_' + size_string
model_file_name = os.path.join(model_dir_name, model_base_name)
input_mean = 127.5
input_std = 127.5
@@ -982,7 +933,6 @@ def create_model_info(architecture):
'model_file_name': model_file_name,
'input_mean': input_mean,
'input_std': input_std,
- 'quantize_layer': is_quantized,
}
@@ -1078,7 +1028,7 @@ def main(_):
(train_step, cross_entropy, bottleneck_input, ground_truth_input,
final_tensor) = add_final_training_ops(
len(image_lists.keys()), FLAGS.final_tensor_name, bottleneck_tensor,
- model_info['bottleneck_tensor_size'], model_info['quantize_layer'])
+ model_info['bottleneck_tensor_size'])
# Create the operations we need to evaluate the accuracy of our new layer.
evaluation_step, prediction = add_evaluation_step(
diff --git a/tensorflow/examples/image_retraining/retrain_test.py b/tensorflow/examples/image_retraining/retrain_test.py
index 2de4c4ec99..c342a17dd8 100644
--- a/tensorflow/examples/image_retraining/retrain_test.py
+++ b/tensorflow/examples/image_retraining/retrain_test.py
@@ -70,18 +70,10 @@ class ImageRetrainingTest(test_util.TensorFlowTestCase):
def testAddFinalTrainingOps(self, flags_mock):
with tf.Graph().as_default():
with tf.Session() as sess:
- bottleneck = tf.placeholder(tf.float32, [1, 1024], name='bottleneck')
- # Test creating final training op with quantization
- retrain.add_final_training_ops(5, 'final', bottleneck, 1024, False)
- self.assertIsNotNone(sess.graph.get_tensor_by_name('final:0'))
-
- @tf.test.mock.patch.object(retrain, 'FLAGS', learning_rate=0.01)
- def testAddFinalTrainingOpsQuantized(self, flags_mock):
- with tf.Graph().as_default():
- with tf.Session() as sess:
- bottleneck = tf.placeholder(tf.float32, [1, 1024], name='bottleneck')
- # Test creating final training op with quantization
- retrain.add_final_training_ops(5, 'final', bottleneck, 1024, True)
+ bottleneck = tf.placeholder(
+ tf.float32, [1, 1024],
+ name='bottleneck')
+ retrain.add_final_training_ops(5, 'final', bottleneck, 1024)
self.assertIsNotNone(sess.graph.get_tensor_by_name('final:0'))
def testAddEvaluationStep(self):
@@ -107,12 +99,5 @@ class ImageRetrainingTest(test_util.TensorFlowTestCase):
self.assertIsNotNone(model_info)
self.assertEqual(299, model_info['input_width'])
- def testCreateModelInfoQuantized(self):
- # Test for mobilenet_quantized
- model_info = retrain.create_model_info('mobilenet_1.0_224')
- self.assertIsNotNone(model_info)
- self.assertEqual(224, model_info['input_width'])
-
-
if __name__ == '__main__':
tf.test.main()
diff --git a/tensorflow/examples/learn/iris.py b/tensorflow/examples/learn/iris.py
index 03e60972aa..0a50b3ba87 100644
--- a/tensorflow/examples/learn/iris.py
+++ b/tensorflow/examples/learn/iris.py
@@ -11,10 +11,7 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
-"""Example of DNNClassifier for Iris plant dataset.
-
-This example uses APIs in Tensorflow 1.4 or above.
-"""
+"""Example of DNNClassifier for Iris plant dataset."""
from __future__ import absolute_import
from __future__ import division
diff --git a/tensorflow/examples/learn/wide_n_deep_tutorial.py b/tensorflow/examples/learn/wide_n_deep_tutorial.py
index 072353392a..e447b3e24e 100644
--- a/tensorflow/examples/learn/wide_n_deep_tutorial.py
+++ b/tensorflow/examples/learn/wide_n_deep_tutorial.py
@@ -12,10 +12,7 @@
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
-"""Example code for TensorFlow Wide & Deep Tutorial using TF High Level API.
-
-This example uses APIs in Tensorflow 1.4 or above.
-"""
+"""Example code for TensorFlow Wide & Deep Tutorial using TF.Learn API."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
diff --git a/tensorflow/examples/speech_commands/models.py b/tensorflow/examples/speech_commands/models.py
index 82d6a94ea1..ab611f414a 100644
--- a/tensorflow/examples/speech_commands/models.py
+++ b/tensorflow/examples/speech_commands/models.py
@@ -326,7 +326,7 @@ def create_low_latency_conv_model(fingerprint_input, model_settings,
first_filter_height = input_time_size
first_filter_count = 186
first_filter_stride_x = 1
- first_filter_stride_y = 4
+ first_filter_stride_y = 1
first_weights = tf.Variable(
tf.truncated_normal(
[first_filter_height, first_filter_width, 1, first_filter_count],
diff --git a/tensorflow/go/android.go b/tensorflow/go/android.go
new file mode 100644
index 0000000000..f7d666b7a9
--- /dev/null
+++ b/tensorflow/go/android.go
@@ -0,0 +1,6 @@
+// +build android
+
+package tensorflow
+
+// #cgo LDFLAGS: -landroid -llog -lm -lz -ldl
+import "C"
diff --git a/tensorflow/go/op/wrappers.go b/tensorflow/go/op/wrappers.go
index eb79da5384..4e5d17f76f 100644
--- a/tensorflow/go/op/wrappers.go
+++ b/tensorflow/go/op/wrappers.go
@@ -62,29 +62,6 @@ func WriteScalarSummary(scope *Scope, writer tf.Output, global_step tf.Output, t
return scope.AddOperation(opspec)
}
-// Outputs a `tf.Event` protocol buffer.
-//
-// When CreateSummaryDbWriter is being used, this op can be useful for
-// importing data from event logs.
-//
-// Arguments:
-// writer: A handle to a summary writer.
-// event: A string containing a binary-encoded tf.Event proto.
-//
-// Returns the created operation.
-func ImportEvent(scope *Scope, writer tf.Output, event tf.Output) (o *tf.Operation) {
- if scope.Err() != nil {
- return
- }
- opspec := tf.OpSpec{
- Type: "ImportEvent",
- Input: []tf.Input{
- writer, event,
- },
- }
- return scope.AddOperation(opspec)
-}
-
// Outputs a `Summary` protocol buffer with a tensor.
//
// Arguments:
@@ -4006,6 +3983,41 @@ func TensorArrayWriteV2(scope *Scope, handle tf.Output, index tf.Output, value t
return op.Output(0)
}
+// Identity op for gradient debugging.
+//
+// This op is hidden from public in Python. It is used by TensorFlow Debugger to
+// register gradient tensors for gradient debugging.
+func DebugGradientIdentity(scope *Scope, input tf.Output) (output tf.Output) {
+ if scope.Err() != nil {
+ return
+ }
+ opspec := tf.OpSpec{
+ Type: "DebugGradientIdentity",
+ Input: []tf.Input{
+ input,
+ },
+ }
+ op := scope.AddOperation(opspec)
+ return op.Output(0)
+}
+
+// Deprecated. Use TensorArrayGradV3
+func TensorArrayGradV2(scope *Scope, handle tf.Output, flow_in tf.Output, source string) (grad_handle tf.Output) {
+ if scope.Err() != nil {
+ return
+ }
+ attrs := map[string]interface{}{"source": source}
+ opspec := tf.OpSpec{
+ Type: "TensorArrayGradV2",
+ Input: []tf.Input{
+ handle, flow_in,
+ },
+ Attrs: attrs,
+ }
+ op := scope.AddOperation(opspec)
+ return op.Output(0)
+}
+
// Get the current size of the TensorArray.
//
// Arguments:
@@ -4539,6 +4551,31 @@ func QueueCloseV2(scope *Scope, handle tf.Output, optional ...QueueCloseV2Attr)
return scope.AddOperation(opspec)
}
+// Concatenates tensors along one dimension.
+//
+// Arguments:
+// values: List of `N` Tensors to concatenate. Their ranks and types must match,
+// and their sizes must match in all dimensions except `concat_dim`.
+// axis: 0-D. The dimension along which to concatenate. Must be in the
+// range [-rank(values), rank(values)).
+//
+// Returns A `Tensor` with the concatenation of values stacked along the
+// `concat_dim` dimension. This tensor's shape matches that of `values` except
+// in `concat_dim` where it has the sum of the sizes.
+func ConcatV2(scope *Scope, values []tf.Output, axis tf.Output) (output tf.Output) {
+ if scope.Err() != nil {
+ return
+ }
+ opspec := tf.OpSpec{
+ Type: "ConcatV2",
+ Input: []tf.Input{
+ tf.OutputList(values), axis,
+ },
+ }
+ op := scope.AddOperation(opspec)
+ return op.Output(0)
+}
+
// QueueDequeueUpToV2Attr is an optional argument to QueueDequeueUpToV2.
type QueueDequeueUpToV2Attr func(optionalAttr)
@@ -4955,6 +4992,80 @@ func PriorityQueueV2(scope *Scope, shapes []tf.Shape, optional ...PriorityQueueV
return op.Output(0)
}
+// FIFOQueueV2Attr is an optional argument to FIFOQueueV2.
+type FIFOQueueV2Attr func(optionalAttr)
+
+// FIFOQueueV2Shapes sets the optional shapes attribute to value.
+//
+// value: The shape of each component in a value. The length of this attr must
+// be either 0 or the same as the length of component_types. If the length of
+// this attr is 0, the shapes of queue elements are not constrained, and
+// only one element may be dequeued at a time.
+// If not specified, defaults to <>
+//
+// REQUIRES: len(value) >= 0
+func FIFOQueueV2Shapes(value []tf.Shape) FIFOQueueV2Attr {
+ return func(m optionalAttr) {
+ m["shapes"] = value
+ }
+}
+
+// FIFOQueueV2Capacity sets the optional capacity attribute to value.
+//
+// value: The upper bound on the number of elements in this queue.
+// Negative numbers mean no limit.
+// If not specified, defaults to -1
+func FIFOQueueV2Capacity(value int64) FIFOQueueV2Attr {
+ return func(m optionalAttr) {
+ m["capacity"] = value
+ }
+}
+
+// FIFOQueueV2Container sets the optional container attribute to value.
+//
+// value: If non-empty, this queue is placed in the given container.
+// Otherwise, a default container is used.
+// If not specified, defaults to ""
+func FIFOQueueV2Container(value string) FIFOQueueV2Attr {
+ return func(m optionalAttr) {
+ m["container"] = value
+ }
+}
+
+// FIFOQueueV2SharedName sets the optional shared_name attribute to value.
+//
+// value: If non-empty, this queue will be shared under the given name
+// across multiple sessions.
+// If not specified, defaults to ""
+func FIFOQueueV2SharedName(value string) FIFOQueueV2Attr {
+ return func(m optionalAttr) {
+ m["shared_name"] = value
+ }
+}
+
+// A queue that produces elements in first-in first-out order.
+//
+// Arguments:
+// component_types: The type of each component in a value.
+//
+// Returns The handle to the queue.
+func FIFOQueueV2(scope *Scope, component_types []tf.DataType, optional ...FIFOQueueV2Attr) (handle tf.Output) {
+ if scope.Err() != nil {
+ return
+ }
+ attrs := map[string]interface{}{"component_types": component_types}
+ for _, a := range optional {
+ a(attrs)
+ }
+ opspec := tf.OpSpec{
+ Type: "FIFOQueueV2",
+
+ Attrs: attrs,
+ }
+ op := scope.AddOperation(opspec)
+ return op.Output(0)
+}
+
// StridedSliceAttr is an optional argument to StridedSlice.
type StridedSliceAttr func(optionalAttr)
@@ -5334,101 +5445,6 @@ func DynamicStitch(scope *Scope, indices []tf.Output, data []tf.Output) (merged
return op.Output(0)
}
-// FIFOQueueV2Attr is an optional argument to FIFOQueueV2.
-type FIFOQueueV2Attr func(optionalAttr)
-
-// FIFOQueueV2Shapes sets the optional shapes attribute to value.
-//
-// value: The shape of each component in a value. The length of this attr must
-// be either 0 or the same as the length of component_types. If the length of
-// this attr is 0, the shapes of queue elements are not constrained, and
-// only one element may be dequeued at a time.
-// If not specified, defaults to <>
-//
-// REQUIRES: len(value) >= 0
-func FIFOQueueV2Shapes(value []tf.Shape) FIFOQueueV2Attr {
- return func(m optionalAttr) {
- m["shapes"] = value
- }
-}
-
-// FIFOQueueV2Capacity sets the optional capacity attribute to value.
-//
-// value: The upper bound on the number of elements in this queue.
-// Negative numbers mean no limit.
-// If not specified, defaults to -1
-func FIFOQueueV2Capacity(value int64) FIFOQueueV2Attr {
- return func(m optionalAttr) {
- m["capacity"] = value
- }
-}
-
-// FIFOQueueV2Container sets the optional container attribute to value.
-//
-// value: If non-empty, this queue is placed in the given container.
-// Otherwise, a default container is used.
-// If not specified, defaults to ""
-func FIFOQueueV2Container(value string) FIFOQueueV2Attr {
- return func(m optionalAttr) {
- m["container"] = value
- }
-}
-
-// FIFOQueueV2SharedName sets the optional shared_name attribute to value.
-//
-// value: If non-empty, this queue will be shared under the given name
-// across multiple sessions.
-// If not specified, defaults to ""
-func FIFOQueueV2SharedName(value string) FIFOQueueV2Attr {
- return func(m optionalAttr) {
- m["shared_name"] = value
- }
-}
-
-// A queue that produces elements in first-in first-out order.
-//
-// Arguments:
-// component_types: The type of each component in a value.
-//
-// Returns The handle to the queue.
-func FIFOQueueV2(scope *Scope, component_types []tf.DataType, optional ...FIFOQueueV2Attr) (handle tf.Output) {
- if scope.Err() != nil {
- return
- }
- attrs := map[string]interface{}{"component_types": component_types}
- for _, a := range optional {
- a(attrs)
- }
- opspec := tf.OpSpec{
- Type: "FIFOQueueV2",
-
- Attrs: attrs,
- }
- op := scope.AddOperation(opspec)
- return op.Output(0)
-}
-
-// Converts the given `resource_handle` representing an iterator to a variant tensor.
-//
-// Arguments:
-// resource_handle: A handle to an iterator resource.
-//
-// Returns A variant tensor storing the state of the iterator contained in the
-// resource.
-func SerializeIterator(scope *Scope, resource_handle tf.Output) (serialized tf.Output) {
- if scope.Err() != nil {
- return
- }
- opspec := tf.OpSpec{
- Type: "SerializeIterator",
- Input: []tf.Input{
- resource_handle,
- },
- }
- op := scope.AddOperation(opspec)
- return op.Output(0)
-}
-
// Return a tensor with the same shape and contents as the input tensor or value.
func Identity(scope *Scope, input tf.Output) (output tf.Output) {
if scope.Err() != nil {
@@ -5560,39 +5576,6 @@ func IteratorToStringHandle(scope *Scope, resource_handle tf.Output) (string_han
return op.Output(0)
}
-// Outputs the single element from the given dataset.
-//
-// Arguments:
-// dataset: A handle to a dataset that contains a single element.
-//
-//
-//
-// Returns The components of the single element of `input`.
-func DatasetToSingleElement(scope *Scope, dataset tf.Output, output_types []tf.DataType, output_shapes []tf.Shape) (components []tf.Output) {
- if scope.Err() != nil {
- return
- }
- attrs := map[string]interface{}{"output_types": output_types, "output_shapes": output_shapes}
- opspec := tf.OpSpec{
- Type: "DatasetToSingleElement",
- Input: []tf.Input{
- dataset,
- },
- Attrs: attrs,
- }
- op := scope.AddOperation(opspec)
- if scope.Err() != nil {
- return
- }
- var idx int
- var err error
- if components, idx, err = makeOutputList(op, idx, "components"); err != nil {
- scope.UpdateErr("DatasetToSingleElement", err)
- return
- }
- return components
-}
-
// Gets the next output from the given iterator.
func IteratorGetNext(scope *Scope, iterator tf.Output, output_types []tf.DataType, output_shapes []tf.Shape) (components []tf.Output) {
if scope.Err() != nil {
@@ -5713,30 +5696,6 @@ func FixedLengthRecordDataset(scope *Scope, filenames tf.Output, header_bytes tf
return op.Output(0)
}
-// Creates a dataset that executes a SQL query and emits rows of the result set.
-//
-// Arguments:
-// driver_name: The database type. Currently, the only supported type is 'sqlite'.
-// data_source_name: A connection string to connect to the database.
-// query: A SQL query to execute.
-//
-//
-func SqlDataset(scope *Scope, driver_name tf.Output, data_source_name tf.Output, query tf.Output, output_types []tf.DataType, output_shapes []tf.Shape) (handle tf.Output) {
- if scope.Err() != nil {
- return
- }
- attrs := map[string]interface{}{"output_types": output_types, "output_shapes": output_shapes}
- opspec := tf.OpSpec{
- Type: "SqlDataset",
- Input: []tf.Input{
- driver_name, data_source_name, query,
- },
- Attrs: attrs,
- }
- op := scope.AddOperation(opspec)
- return op.Output(0)
-}
-
// PlaceholderAttr is an optional argument to Placeholder.
type PlaceholderAttr func(optionalAttr)
@@ -5807,68 +5766,6 @@ func CacheDataset(scope *Scope, input_dataset tf.Output, filename tf.Output, out
return op.Output(0)
}
-// Identity op for gradient debugging.
-//
-// This op is hidden from public in Python. It is used by TensorFlow Debugger to
-// register gradient tensors for gradient debugging.
-func DebugGradientIdentity(scope *Scope, input tf.Output) (output tf.Output) {
- if scope.Err() != nil {
- return
- }
- opspec := tf.OpSpec{
- Type: "DebugGradientIdentity",
- Input: []tf.Input{
- input,
- },
- }
- op := scope.AddOperation(opspec)
- return op.Output(0)
-}
-
-// Deprecated. Use TensorArrayGradV3
-func TensorArrayGradV2(scope *Scope, handle tf.Output, flow_in tf.Output, source string) (grad_handle tf.Output) {
- if scope.Err() != nil {
- return
- }
- attrs := map[string]interface{}{"source": source}
- opspec := tf.OpSpec{
- Type: "TensorArrayGradV2",
- Input: []tf.Input{
- handle, flow_in,
- },
- Attrs: attrs,
- }
- op := scope.AddOperation(opspec)
- return op.Output(0)
-}
-
-// Creates a dataset that yields a SparseTensor for each element of the input.
-//
-// Arguments:
-// input_dataset: A handle to an input dataset. Must have a single component.
-// batch_size: A scalar representing the number of elements to accumulate in a
-// batch.
-// row_shape: A vector representing the dense shape of each row in the produced
-// SparseTensor. The shape may be partially specified, using `-1` to indicate
-// that a particular dimension should use the maximum size of all batch elements.
-//
-//
-func DenseToSparseBatchDataset(scope *Scope, input_dataset tf.Output, batch_size tf.Output, row_shape tf.Output, output_types []tf.DataType, output_shapes []tf.Shape) (handle tf.Output) {
- if scope.Err() != nil {
- return
- }
- attrs := map[string]interface{}{"output_types": output_types, "output_shapes": output_shapes}
- opspec := tf.OpSpec{
- Type: "DenseToSparseBatchDataset",
- Input: []tf.Input{
- input_dataset, batch_size, row_shape,
- },
- Attrs: attrs,
- }
- op := scope.AddOperation(opspec)
- return op.Output(0)
-}
-
// Creates a dataset that batches and pads `batch_size` elements from the input.
//
// Arguments:
@@ -5929,69 +5826,6 @@ func TensorArrayConcatV2(scope *Scope, handle tf.Output, flow_in tf.Output, dtyp
return op.Output(0), op.Output(1)
}
-// Converts the given variant tensor to an iterator and stores it in the given resource.
-//
-// Arguments:
-// resource_handle: A handle to an iterator resource.
-// serialized: A variant tensor storing the state of the iterator contained in the
-// resource.
-//
-// Returns the created operation.
-func DeserializeIterator(scope *Scope, resource_handle tf.Output, serialized tf.Output) (o *tf.Operation) {
- if scope.Err() != nil {
- return
- }
- opspec := tf.OpSpec{
- Type: "DeserializeIterator",
- Input: []tf.Input{
- resource_handle, serialized,
- },
- }
- return scope.AddOperation(opspec)
-}
-
-// Concatenates tensors along one dimension.
-//
-// Arguments:
-// values: List of `N` Tensors to concatenate. Their ranks and types must match,
-// and their sizes must match in all dimensions except `concat_dim`.
-// axis: 0-D. The dimension along which to concatenate. Must be in the
-// range [-rank(values), rank(values)).
-//
-// Returns A `Tensor` with the concatenation of values stacked along the
-// `concat_dim` dimension. This tensor's shape matches that of `values` except
-// in `concat_dim` where it has the sum of the sizes.
-func ConcatV2(scope *Scope, values []tf.Output, axis tf.Output) (output tf.Output) {
- if scope.Err() != nil {
- return
- }
- opspec := tf.OpSpec{
- Type: "ConcatV2",
- Input: []tf.Input{
- tf.OutputList(values), axis,
- },
- }
- op := scope.AddOperation(opspec)
- return op.Output(0)
-}
-
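The doc comment above pins down ConcatV2's shape rule: the output matches `values` everywhere except `concat_dim`, where sizes are summed. A minimal Go sketch, assuming the wrapper signature shown in this hunk; two 2x3 constants concatenated along dimension 0 give a 4x3 result:

    package main

    import (
    	tf "github.com/tensorflow/tensorflow/tensorflow/go"
    	"github.com/tensorflow/tensorflow/tensorflow/go/op"
    )

    func main() {
    	root := op.NewScope()
    	a := op.Const(root.SubScope("a"), [][]float32{{1, 2, 3}, {4, 5, 6}})
    	b := op.Const(root.SubScope("b"), [][]float32{{7, 8, 9}, {10, 11, 12}})
    	axis := op.Const(root.SubScope("axis"), int32(0))
    	// Two 2x3 matrices stacked along dimension 0 -> one 4x3 matrix.
    	_ = op.ConcatV2(root, []tf.Output{a, b}, axis)
    }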
-// Creates a dataset that contains the elements of `input_dataset` ignoring errors.
-func IgnoreErrorsDataset(scope *Scope, input_dataset tf.Output, output_types []tf.DataType, output_shapes []tf.Shape) (handle tf.Output) {
- if scope.Err() != nil {
- return
- }
- attrs := map[string]interface{}{"output_types": output_types, "output_shapes": output_shapes}
- opspec := tf.OpSpec{
- Type: "IgnoreErrorsDataset",
- Input: []tf.Input{
- input_dataset,
- },
- Attrs: attrs,
- }
- op := scope.AddOperation(opspec)
- return op.Output(0)
-}
-
// Creates a dataset that concatenates `input_dataset` with `another_dataset`.
func ConcatenateDataset(scope *Scope, input_dataset tf.Output, another_dataset tf.Output, output_types []tf.DataType, output_shapes []tf.Shape) (handle tf.Output) {
if scope.Err() != nil {
@@ -22477,39 +22311,6 @@ func QuantizedBiasAdd(scope *Scope, input tf.Output, bias tf.Output, min_input t
return op.Output(0), op.Output(1), op.Output(2)
}
-// Creates summary database writer accessible by given resource handle.
-//
-// This can be used to write tensors from the execution graph directly
-// to a database. Only SQLite is supported right now. This function
-// will create the schema if it doesn't exist. Entries in the Users,
-// Experiments, and Runs tables will be created automatically if they
-// don't already exist.
-//
-// Arguments:
-// writer: Handle to SummaryWriter resource to overwrite.
-// db_uri: For example "file:/tmp/foo.sqlite".
-// experiment_name: Can't contain ASCII control characters or <>. Case
-// sensitive. If empty, then the Run will not be associated with any
-// Experiment.
-// run_name: Can't contain ASCII control characters or <>. Case sensitive.
-// If empty, then each Tag will not be associated with any Run.
-// user_name: Must be valid as both a DNS label and Linux username. If
-// empty, then the Experiment will not be associated with any User.
-//
-// Returns the created operation.
-func CreateSummaryDbWriter(scope *Scope, writer tf.Output, db_uri tf.Output, experiment_name tf.Output, run_name tf.Output, user_name tf.Output) (o *tf.Operation) {
- if scope.Err() != nil {
- return
- }
- opspec := tf.OpSpec{
- Type: "CreateSummaryDbWriter",
- Input: []tf.Input{
- writer, db_uri, experiment_name, run_name, user_name,
- },
- }
- return scope.AddOperation(opspec)
-}
-
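Tying the argument documentation above together, a hedged Go sketch of how this wrapper (as it existed before the revert) would be wired; `writer` is left as a zero-value placeholder because creating the SummaryWriter resource is outside this fragment:

    package main

    import (
    	tf "github.com/tensorflow/tensorflow/tensorflow/go"
    	"github.com/tensorflow/tensorflow/tensorflow/go/op"
    )

    func main() {
    	root := op.NewScope()
    	var writer tf.Output // assumed: handle from a SummaryWriter resource op
    	dbURI := op.Const(root.SubScope("db"), "file:/tmp/foo.sqlite")
    	exp := op.Const(root.SubScope("exp"), "my-experiment")
    	run := op.Const(root.SubScope("run"), "run-1")
    	user := op.Const(root.SubScope("user"), "alice")
    	// Returns the created operation; run it once to set up the schema.
    	_ = op.CreateSummaryDbWriter(root, writer, dbURI, exp, run, user)
    }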
// HistogramFixedWidthAttr is an optional argument to HistogramFixedWidth.
type HistogramFixedWidthAttr func(optionalAttr)
diff --git a/tensorflow/go/operation_test.go b/tensorflow/go/operation_test.go
index 7cba043af2..40c951ab8c 100644
--- a/tensorflow/go/operation_test.go
+++ b/tensorflow/go/operation_test.go
@@ -123,6 +123,14 @@ func TestOutputDataTypeAndShape(t *testing.T) {
[]int64{2, 3},
Double,
},
+ { // Matrix of Uint64
+ [][]uint64{
+ {1, 2, 3},
+ {4, 5, 6},
+ },
+ []int64{2, 3},
+ Uint64,
+ },
}
for idx, test := range testdata {
t.Run(fmt.Sprintf("#%d Value %T", idx, test.Value), func(t *testing.T) {
diff --git a/tensorflow/go/tensor.go b/tensorflow/go/tensor.go
index 36a74c0081..1326a95278 100644
--- a/tensorflow/go/tensor.go
+++ b/tensorflow/go/tensor.go
@@ -101,7 +101,7 @@ func NewTensor(value interface{}) (*Tensor, error) {
return nil, bug("NewTensor incorrectly calculated the size of a tensor with type %v and shape %v as %v bytes instead of %v", dataType, shape, nbytes, buf.Len())
}
} else {
- e := stringEncoder{offsets: buf, data: raw[nflattened*8 : len(raw)], status: newStatus()}
+ e := stringEncoder{offsets: buf, data: raw[nflattened*8:], status: newStatus()}
if err := e.encode(reflect.ValueOf(value), shape); err != nil {
return nil, err
}
@@ -207,6 +207,9 @@ func (t *Tensor) WriteContentsTo(w io.Writer) (int64, error) {
func tensorData(c *C.TF_Tensor) []byte {
// See: https://github.com/golang/go/wiki/cgo#turning-c-arrays-into-go-slices
cbytes := C.TF_TensorData(c)
+ if cbytes == nil {
+ return nil
+ }
length := int(C.TF_TensorByteSize(c))
slice := (*[1 << 30]byte)(unsafe.Pointer(cbytes))[:length:length]
return slice
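The nil check added above guards the slice conversion against TF_TensorData returning NULL (for example, for zero-byte tensors). For reference, a self-contained sketch of the cgo array-to-slice pattern the comment links to; the buffer size and contents are arbitrary:

    package main

    /*
    #include <stdlib.h>
    */
    import "C"

    import (
    	"fmt"
    	"unsafe"
    )

    func main() {
    	// View a C buffer as a Go slice without copying, as tensorData does.
    	// If the C pointer can be NULL (as TF_TensorData's can), check first.
    	buf := C.malloc(16)
    	defer C.free(buf)
    	view := (*[1 << 30]byte)(unsafe.Pointer(buf))[:16:16]
    	view[0] = 42
    	fmt.Println(len(view), cap(view), view[0]) // 16 16 42
    }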
@@ -310,7 +313,7 @@ func encodeTensor(w *bytes.Buffer, v reflect.Value, shape []int64) error {
if err := w.WriteByte(b); err != nil {
return err
}
- case reflect.Int8, reflect.Int16, reflect.Int32, reflect.Int64, reflect.Uint8, reflect.Uint16, reflect.Float32, reflect.Float64, reflect.Complex64, reflect.Complex128:
+ case reflect.Int8, reflect.Int16, reflect.Int32, reflect.Int64, reflect.Uint8, reflect.Uint16, reflect.Uint32, reflect.Uint64, reflect.Float32, reflect.Float64, reflect.Complex64, reflect.Complex128:
if err := binary.Write(w, nativeEndian, v.Interface()); err != nil {
return err
}
@@ -349,7 +352,7 @@ func decodeTensor(r *bytes.Reader, shape []int64, typ reflect.Type, ptr reflect.
return err
}
ptr.Elem().SetBool(b == 1)
- case reflect.Int8, reflect.Int16, reflect.Int32, reflect.Int64, reflect.Uint8, reflect.Uint16, reflect.Float32, reflect.Float64, reflect.Complex64, reflect.Complex128:
+ case reflect.Int8, reflect.Int16, reflect.Int32, reflect.Int64, reflect.Uint8, reflect.Uint16, reflect.Uint32, reflect.Uint64, reflect.Float32, reflect.Float64, reflect.Complex64, reflect.Complex128:
if err := binary.Read(r, nativeEndian, ptr.Interface()); err != nil {
return err
}
diff --git a/tensorflow/go/tensor_test.go b/tensorflow/go/tensor_test.go
index 35bd2fd9a5..674a8ce86f 100644
--- a/tensorflow/go/tensor_test.go
+++ b/tensorflow/go/tensor_test.go
@@ -34,11 +34,15 @@ func TestNewTensor(t *testing.T) {
{nil, int64(5)},
{nil, uint8(5)},
{nil, uint16(5)},
+ {nil, uint32(5)},
+ {nil, uint64(5)},
{nil, float32(5)},
{nil, float64(5)},
{nil, complex(float32(5), float32(6))},
{nil, complex(float64(5), float64(6))},
{nil, "a string"},
+ {[]int64{1}, []uint32{1}},
+ {[]int64{1}, []uint64{1}},
{[]int64{2}, []bool{true, false}},
{[]int64{1}, []float64{1}},
{[]int64{1}, [1]float64{1}},
@@ -71,11 +75,6 @@ func TestNewTensor(t *testing.T) {
// native ints not supported
int(5),
[]int{5},
- // uint32 and uint64 are not supported in TensorFlow
- uint32(5),
- []uint32{5},
- uint64(5),
- []uint64{5},
// Mismatched dimensions
[][]float32{{1, 2, 3}, {4}},
// Mismatched dimensions. Should return "mismatched slice lengths" error instead of "BUG"
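With the encode/decode cases extended to Uint32/Uint64 above, unsigned 32- and 64-bit values now round-trip through NewTensor, which is what the new test entries exercise. A minimal sketch:

    package main

    import (
    	"fmt"

    	tf "github.com/tensorflow/tensorflow/tensorflow/go"
    )

    func main() {
    	// Previously rejected; now a [2 3] tensor of unsigned 64-bit ints.
    	t, err := tf.NewTensor([][]uint64{{1, 2, 3}, {4, 5, 6}})
    	if err != nil {
    		panic(err)
    	}
    	fmt.Println(t.Shape()) // [2 3]
    }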
diff --git a/tensorflow/java/src/main/java/org/tensorflow/NativeLibrary.java b/tensorflow/java/src/main/java/org/tensorflow/NativeLibrary.java
index 499757e8cf..2b431eebf5 100644
--- a/tensorflow/java/src/main/java/org/tensorflow/NativeLibrary.java
+++ b/tensorflow/java/src/main/java/org/tensorflow/NativeLibrary.java
@@ -43,6 +43,7 @@ final class NativeLibrary {
private static final boolean DEBUG =
System.getProperty("org.tensorflow.NativeLibrary.DEBUG") != null;
private static final String JNI_LIBNAME = "tensorflow_jni";
+ private static final String FRAMEWORK_LIBNAME = "tensorflow_framework";
public static void load() {
if (isLoaded() || tryLoadLibrary()) {
@@ -58,15 +59,12 @@ final class NativeLibrary {
}
// Native code is not present, perhaps it has been packaged into the .jar file containing this.
// Extract the JNI library itself
- final String jniLibName = System.mapLibraryName(JNI_LIBNAME);
- final String jniResourceName = makeResourceName(jniLibName);
+ final String jniResourceName = makeResourceName(JNI_LIBNAME);
log("jniResourceName: " + jniResourceName);
final InputStream jniResource =
NativeLibrary.class.getClassLoader().getResourceAsStream(jniResourceName);
// Extract the JNI's dependency
- final String frameworkLibName =
- maybeAdjustForMacOS(System.mapLibraryName("tensorflow_framework"));
- final String frameworkResourceName = makeResourceName(frameworkLibName);
+ final String frameworkResourceName = makeResourceName(FRAMEWORK_LIBNAME);
log("frameworkResourceName: " + frameworkResourceName);
final InputStream frameworkResource =
NativeLibrary.class.getClassLoader().getResourceAsStream(frameworkResourceName);
@@ -90,15 +88,12 @@ final class NativeLibrary {
tempPath.deleteOnExit();
final String tempDirectory = tempPath.toString();
if (frameworkResource != null) {
- extractResource(frameworkResource, frameworkLibName, tempDirectory);
+ extractResource(frameworkResource, FRAMEWORK_LIBNAME, tempDirectory);
} else {
- log(
- frameworkResourceName
- + " not found. This is fine assuming "
- + jniResourceName
- + " is not built to depend on it.");
+ log(frameworkResourceName + " not found. This is fine assuming " + jniResourceName
+ + " is not built to depend on it.");
}
- System.load(extractResource(jniResource, jniLibName, tempDirectory));
+ System.load(extractResource(jniResource, JNI_LIBNAME, tempDirectory));
} catch (IOException e) {
throw new UnsatisfiedLinkError(
String.format(
@@ -126,27 +121,9 @@ final class NativeLibrary {
}
}
- private static String maybeAdjustForMacOS(String libFilename) {
- if (!System.getProperty("os.name").contains("OS X")) {
- return libFilename;
- }
- // This is macOS, and the TensorFlow release process might have setup dependencies on
- // libtensorflow_framework.so instead of libtensorflow_framework.dylib. Adjust for that.
- final ClassLoader cl = NativeLibrary.class.getClassLoader();
- if (cl.getResource(makeResourceName(libFilename)) != null) {
- return libFilename;
- }
- // liftensorflow_framework.dylib not found, try libtensorflow_framework.so
- final String suffix = ".dylib";
- if (!libFilename.endsWith(suffix)) {
- return libFilename;
- }
- return libFilename.substring(0, libFilename.length() - suffix.length()) + ".so";
- }
-
private static String extractResource(
InputStream resource, String resourceName, String extractToDirectory) throws IOException {
- final File dst = new File(extractToDirectory, resourceName);
+ final File dst = new File(extractToDirectory, System.mapLibraryName(resourceName));
dst.deleteOnExit();
final String dstPath = dst.toString();
log("extracting native library to: " + dstPath);
@@ -180,7 +157,9 @@ final class NativeLibrary {
}
private static String makeResourceName(String baseName) {
- return "org/tensorflow/native/" + String.format("%s-%s/", os(), architecture()) + baseName;
+ return "org/tensorflow/native/"
+ + String.format("%s-%s/", os(), architecture())
+ + System.mapLibraryName(baseName);
}
private static long copy(InputStream src, File dstFile) throws IOException {
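The refactoring above keeps base names like "tensorflow_jni" platform-neutral and applies System.mapLibraryName only where the platform-specific file name is actually needed: when building the resource name and when extracting to disk. A short standalone illustration of what that mapping does:

    public class MapLibraryNameDemo {
      public static void main(String[] args) {
        // System.mapLibraryName applies the platform's naming convention:
        // "tensorflow_jni" -> libtensorflow_jni.so on Linux,
        // libtensorflow_jni.dylib on macOS, tensorflow_jni.dll on Windows.
        System.out.println(System.mapLibraryName("tensorflow_jni"));
      }
    }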
diff --git a/tensorflow/java/src/main/java/org/tensorflow/Shape.java b/tensorflow/java/src/main/java/org/tensorflow/Shape.java
index 9aa92be111..d533c3d480 100644
--- a/tensorflow/java/src/main/java/org/tensorflow/Shape.java
+++ b/tensorflow/java/src/main/java/org/tensorflow/Shape.java
@@ -77,6 +77,24 @@ public final class Shape {
return shape[i];
}
+ @Override
+ public int hashCode() {
+ return Arrays.hashCode(shape);
+ }
+
+ @Override
+ public boolean equals(Object obj) {
+ if (this == obj) {
+ return true;
+ }
+
+ if (obj instanceof Shape && Arrays.equals(this.shape, ((Shape) obj).shape)) {
+ return !hasUnknownDimension();
+ }
+
+ return super.equals(obj);
+ }
+
/** Succinct description of the shape meant for debugging. */
@Override
public String toString() {
@@ -98,4 +116,18 @@ public final class Shape {
}
private long[] shape;
+
+ private boolean hasUnknownDimension() {
+ if (shape == null) {
+ return true;
+ }
+
+ for (long dimension : shape) {
+ if (dimension == -1) {
+ return true;
+ }
+ }
+
+ return false;
+ }
}
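A small standalone sketch of the equality semantics introduced above: fully known shapes compare structurally, while any unknown dimension (-1, or a null dims array) makes a shape compare unequal, even to itself. Hash codes of two unknown shapes may still collide, which Object's contract permits for unequal objects:

    import org.tensorflow.Shape;

    public class ShapeEqualsExample {
      public static void main(String[] args) {
        // Fully known shapes compare structurally.
        System.out.println(Shape.make(1, 2, 3).equals(Shape.make(1, 2, 3))); // true
        // An unknown dimension means "we cannot prove equality", so even
        // identical-looking partially known shapes are unequal.
        System.out.println(Shape.unknown().equals(Shape.unknown()));         // false
        System.out.println(Shape.make(1, -1, 3).equals(Shape.make(1, -1, 3))); // false
      }
    }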
diff --git a/tensorflow/java/src/main/java/org/tensorflow/types/TFBool.java b/tensorflow/java/src/main/java/org/tensorflow/types/TFBool.java
deleted file mode 100644
index ab34f6aa12..0000000000
--- a/tensorflow/java/src/main/java/org/tensorflow/types/TFBool.java
+++ /dev/null
@@ -1,30 +0,0 @@
-/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
- http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-// GENERATED FILE. To update, edit tftypes.pl instead.
-
-package org.tensorflow.types;
-
-import org.tensorflow.DataType;
-
-/** Represents a boolean. */
-public class TFBool implements TFType {
- private TFBool() {}
- static {
- Types.typeCodes.put(TFBool.class, DataType.BOOL);
- }
- static {
- Types.scalars.put(TFBool.class, false);
- }
-}
diff --git a/tensorflow/java/src/main/java/org/tensorflow/types/TFDouble.java b/tensorflow/java/src/main/java/org/tensorflow/types/TFDouble.java
deleted file mode 100644
index 49e5d9f2f3..0000000000
--- a/tensorflow/java/src/main/java/org/tensorflow/types/TFDouble.java
+++ /dev/null
@@ -1,30 +0,0 @@
-/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
- http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-// GENERATED FILE. To update, edit tftypes.pl instead.
-
-package org.tensorflow.types;
-
-import org.tensorflow.DataType;
-
-/** Represents a 64-bit double precision floating point number. */
-public class TFDouble implements TFType {
- private TFDouble() {}
- static {
- Types.typeCodes.put(TFDouble.class, DataType.DOUBLE);
- }
- static {
- Types.scalars.put(TFDouble.class, 0.0);
- }
-}
diff --git a/tensorflow/java/src/main/java/org/tensorflow/types/TFFloat.java b/tensorflow/java/src/main/java/org/tensorflow/types/TFFloat.java
deleted file mode 100644
index 8426ee41f0..0000000000
--- a/tensorflow/java/src/main/java/org/tensorflow/types/TFFloat.java
+++ /dev/null
@@ -1,30 +0,0 @@
-/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
- http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-// GENERATED FILE. To update, edit tftypes.pl instead.
-
-package org.tensorflow.types;
-
-import org.tensorflow.DataType;
-
-/** Represents a 32-bit single precision floating point number. */
-public class TFFloat implements TFType {
- private TFFloat() {}
- static {
- Types.typeCodes.put(TFFloat.class, DataType.FLOAT);
- }
- static {
- Types.scalars.put(TFFloat.class, 0f);
- }
-}
diff --git a/tensorflow/java/src/main/java/org/tensorflow/types/TFInt32.java b/tensorflow/java/src/main/java/org/tensorflow/types/TFInt32.java
deleted file mode 100644
index 3947b6ad09..0000000000
--- a/tensorflow/java/src/main/java/org/tensorflow/types/TFInt32.java
+++ /dev/null
@@ -1,30 +0,0 @@
-/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
- http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-// GENERATED FILE. To update, edit tftypes.pl instead.
-
-package org.tensorflow.types;
-
-import org.tensorflow.DataType;
-
-/** Represents a 32-bit signed integer. */
-public class TFInt32 implements TFType {
- private TFInt32() {}
- static {
- Types.typeCodes.put(TFInt32.class, DataType.INT32);
- }
- static {
- Types.scalars.put(TFInt32.class, 0);
- }
-}
diff --git a/tensorflow/java/src/main/java/org/tensorflow/types/TFInt64.java b/tensorflow/java/src/main/java/org/tensorflow/types/TFInt64.java
deleted file mode 100644
index ccdded8693..0000000000
--- a/tensorflow/java/src/main/java/org/tensorflow/types/TFInt64.java
+++ /dev/null
@@ -1,30 +0,0 @@
-/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
- http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-// GENERATED FILE. To update, edit tftypes.pl instead.
-
-package org.tensorflow.types;
-
-import org.tensorflow.DataType;
-
-/** Represents a 64-bit signed integer. */
-public class TFInt64 implements TFType {
- private TFInt64() {}
- static {
- Types.typeCodes.put(TFInt64.class, DataType.INT64);
- }
- static {
- Types.scalars.put(TFInt64.class, 0L);
- }
-}
diff --git a/tensorflow/java/src/main/java/org/tensorflow/types/TFString.java b/tensorflow/java/src/main/java/org/tensorflow/types/TFString.java
deleted file mode 100644
index e7327e8c57..0000000000
--- a/tensorflow/java/src/main/java/org/tensorflow/types/TFString.java
+++ /dev/null
@@ -1,27 +0,0 @@
-/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
- http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-// GENERATED FILE. To update, edit tftypes.pl instead.
-
-package org.tensorflow.types;
-
-import org.tensorflow.DataType;
-
-/** Represents an arbitrary sequence of bytes. */
-public class TFString implements TFType {
- private TFString() {}
- static {
- Types.typeCodes.put(TFString.class, DataType.STRING);
- }
-}
diff --git a/tensorflow/java/src/main/java/org/tensorflow/types/TFType.java b/tensorflow/java/src/main/java/org/tensorflow/types/TFType.java
deleted file mode 100644
index 562953ac9d..0000000000
--- a/tensorflow/java/src/main/java/org/tensorflow/types/TFType.java
+++ /dev/null
@@ -1,20 +0,0 @@
-/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
- http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-package org.tensorflow.types;
-
-/**
- * A marker interface for classes representing TensorFlow types.
- */
-public interface TFType {}
diff --git a/tensorflow/java/src/main/java/org/tensorflow/types/TFUInt8.java b/tensorflow/java/src/main/java/org/tensorflow/types/TFUInt8.java
deleted file mode 100644
index d7305ca5a8..0000000000
--- a/tensorflow/java/src/main/java/org/tensorflow/types/TFUInt8.java
+++ /dev/null
@@ -1,30 +0,0 @@
-/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
- http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-// GENERATED FILE. To update, edit tftypes.pl instead.
-
-package org.tensorflow.types;
-
-import org.tensorflow.DataType;
-
-/** Represents an 8-bit unsigned integer. */
-public class TFUInt8 implements TFType {
- private TFUInt8() {}
- static {
- Types.typeCodes.put(TFUInt8.class, DataType.UINT8);
- }
- static {
- Types.scalars.put(TFUInt8.class, (byte)0);
- }
-}
diff --git a/tensorflow/java/src/main/java/org/tensorflow/types/Types.java b/tensorflow/java/src/main/java/org/tensorflow/types/Types.java
deleted file mode 100644
index 976cd9fd34..0000000000
--- a/tensorflow/java/src/main/java/org/tensorflow/types/Types.java
+++ /dev/null
@@ -1,52 +0,0 @@
-/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
- http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-package org.tensorflow.types;
-
-import java.util.HashMap;
-import java.util.Map;
-import org.tensorflow.DataType;
-
-/**
- * Utility class for managing the representation of TensorFlow types as Java
- * types. For each TensorFlow type (e.g., int32), there is a corresponding Java
- * type (e.g., TFInt32) that represents it at compile time and a corresponding
- * class object (e.g., TFInt32.class) that represents it at run time. There is
- * also an enumeration value in DataType that can be used to represent the
- * type, though that should rarely be required.
- */
-public class Types {
-
- private Types() {} // not instantiable
-
- static final Map<Class<?>, DataType> typeCodes = new HashMap<>();
-
- /** Returns the DataType value corresponding to a TensorFlow type class. */
- public static DataType dataType(Class<? extends TFType> c) {
- DataType dtype = typeCodes.get(c);
- if (dtype == null) {
- throw new IllegalArgumentException("" + c + " is not a TensorFlow type.");
- }
- return dtype;
- }
-
- static final Map<Class<?>, Object> scalars = new HashMap<>();
-
- /** Returns the zero value of type described by {@code c}, or null if
- * the type (e.g., string) is not numeric and therefore has no zero value.
- */
- public static Object zeroValue(Class<? extends TFType> c) {
- return scalars.get(c);
- }
-}
diff --git a/tensorflow/java/src/test/java/org/tensorflow/ShapeTest.java b/tensorflow/java/src/test/java/org/tensorflow/ShapeTest.java
index 3b027700c5..92cc3bd60e 100644
--- a/tensorflow/java/src/test/java/org/tensorflow/ShapeTest.java
+++ b/tensorflow/java/src/test/java/org/tensorflow/ShapeTest.java
@@ -16,6 +16,7 @@ limitations under the License.
package org.tensorflow;
import static org.junit.Assert.assertEquals;
+import static org.junit.Assert.assertNotEquals;
import org.junit.Test;
import org.junit.runner.RunWith;
@@ -77,4 +78,29 @@ public class ShapeTest {
assertEquals(5, n.shape().size(1));
}
}
+
+ @Test
+ public void equalsWorksCorrectly() {
+ assertEquals(Shape.scalar(), Shape.scalar());
+ assertEquals(Shape.make(1, 2, 3), Shape.make(1, 2, 3));
+
+ assertNotEquals(Shape.make(1, 2), null);
+ assertNotEquals(Shape.make(1, 2), new Object());
+ assertNotEquals(Shape.make(1, 2, 3), Shape.make(1, 2, 4));
+
+ assertNotEquals(Shape.unknown(), Shape.unknown());
+ assertNotEquals(Shape.make(-1), Shape.make(-1));
+ assertNotEquals(Shape.make(1, -1, 3), Shape.make(1, -1, 3));
+ }
+
+ @Test
+ public void hashCodeIsAsExpected() {
+ assertEquals(Shape.make(1, 2, 3, 4).hashCode(), Shape.make(1, 2, 3, 4).hashCode());
+ assertEquals(Shape.scalar().hashCode(), Shape.scalar().hashCode());
+ assertEquals(Shape.unknown().hashCode(), Shape.unknown().hashCode());
+
+ assertNotEquals(Shape.make(1, 2).hashCode(), Shape.make(1, 3).hashCode());
+ }
}
diff --git a/tensorflow/python/client/session_clusterspec_prop_test.py b/tensorflow/python/client/session_clusterspec_prop_test.py
index 28a4dd27a7..b77912b4f7 100644
--- a/tensorflow/python/client/session_clusterspec_prop_test.py
+++ b/tensorflow/python/client/session_clusterspec_prop_test.py
@@ -169,7 +169,7 @@ class SessionClusterSpecPropagationTest(test_util.TensorFlowTestCase):
# BaseRemoteRendezvous::SameWorkerRecvDone that means the test doesn't
# actually capture the motivating bug unless run on a GPU machine.
#
- # Example error message (before bugfix -- line breaks added because lint):
+ # Example error message (before bugfix -- line breaks added because lint):
#
# W0718 17:14:41.521534 190121 device_mgr.cc:107] Unknown device:
# /job:worker/replica:0/task:0/device:CPU:0 all devices:
diff --git a/tensorflow/python/client/tf_session.i b/tensorflow/python/client/tf_session.i
index 40731aba7d..f45bc13602 100644
--- a/tensorflow/python/client/tf_session.i
+++ b/tensorflow/python/client/tf_session.i
@@ -344,6 +344,16 @@ bool PyTensorListToVector(PyObject* py_tensor_list,
%rename("_TF_SetConfig") TF_SetConfig;
%rename("_TF_NewSessionOptions") TF_NewSessionOptions;
+// Create temporary int64_t to pass to TF_OperationGetAttrInt
+%typemap(in, numinputs=0) int64_t* value (int64_t val) {
+ $1 = &val;
+}
+
+// Convert value to Python int
+%typemap(argout) int64_t* value {
+ $result = PyInt_FromLong(*$1);
+}
+
%include "tensorflow/c/c_api.h"
%include "tensorflow/c/python_api.h"
diff --git a/tensorflow/python/client/timeline.py b/tensorflow/python/client/timeline.py
index 1e96ac5ed4..f3ba4244ce 100644
--- a/tensorflow/python/client/timeline.py
+++ b/tensorflow/python/client/timeline.py
@@ -275,7 +275,7 @@ class _TensorTracker(object):
name: The name of the Tensor as a string.
object_id: Chrome Trace object identifier assigned for this Tensor.
timestamp: The creation timestamp of this event as a long integer.
- pid: Process identifier of the associated device, as an integer.
+ pid: Process identifier of the associated device, as an integer.
allocator: Name of the allocator used to create the Tensor.
num_bytes: Number of bytes allocated (long integer).
diff --git a/tensorflow/python/debug/wrappers/dumping_wrapper_test.py b/tensorflow/python/debug/wrappers/dumping_wrapper_test.py
index acea9433e2..d987ba84b5 100644
--- a/tensorflow/python/debug/wrappers/dumping_wrapper_test.py
+++ b/tensorflow/python/debug/wrappers/dumping_wrapper_test.py
@@ -111,20 +111,6 @@ class DumpingDebugWrapperSessionTest(test_util.TensorFlowTestCase):
self.assertEqual(repr(self.inc_v), dump.run_fetches_info)
self.assertEqual(repr(None), dump.run_feed_keys_info)
- def testDumpingOnASingleRunWorksWithRelativePathForDebugDumpDir(self):
- sess = dumping_wrapper.DumpingDebugWrapperSession(
- self.sess, session_root=self.session_root, log_usage=False)
- sess.run(self.inc_v)
- dump_dirs = glob.glob(os.path.join(self.session_root, "run_*"))
- cwd = os.getcwd()
- try:
- os.chdir(self.session_root)
- dump = debug_data.DebugDumpDir(
- os.path.relpath(dump_dirs[0], self.session_root))
- self.assertAllClose([10.0], dump.get_tensors("v", 0, "DebugIdentity"))
- finally:
- os.chdir(cwd)
-
def testDumpingOnASingleRunWithFeedDictWorks(self):
sess = dumping_wrapper.DumpingDebugWrapperSession(
self.sess, session_root=self.session_root, log_usage=False)
@@ -364,14 +350,12 @@ class DumpingDebugWrapperSessionTest(test_util.TensorFlowTestCase):
thread_name_filter=r"MainThread$")
self.assertAllClose(1.0, sess.run(self.delta))
- child_thread_result = []
def child_thread_job():
- child_thread_result.append(sess.run(self.eta))
+ sess.run(self.eta)
thread = threading.Thread(name="ChildThread", target=child_thread_job)
thread.start()
thread.join()
- self.assertAllClose([-1.4], child_thread_result)
dump_dirs = glob.glob(os.path.join(self.session_root, "run_*"))
self.assertEqual(1, len(dump_dirs))
diff --git a/tensorflow/python/eager/BUILD b/tensorflow/python/eager/BUILD
index c36647b21c..bcd1e1d0dc 100644
--- a/tensorflow/python/eager/BUILD
+++ b/tensorflow/python/eager/BUILD
@@ -14,16 +14,11 @@ cc_library(
"pywrap_tensor.cc",
"pywrap_tfe_src.cc",
],
- hdrs = [
- "pywrap_tensor.h",
- "pywrap_tfe.h",
- ],
+ hdrs = ["pywrap_tfe.h"],
visibility = ["//tensorflow:internal"],
deps = [
"//tensorflow/c:c_api",
- "//tensorflow/c:c_api_internal",
"//tensorflow/c/eager:c_api",
- "//tensorflow/c/eager:c_api_internal",
"//tensorflow/c/eager:tape",
"//tensorflow/core:lib",
"//tensorflow/python:ndarray_tensor",
diff --git a/tensorflow/python/eager/backprop.py b/tensorflow/python/eager/backprop.py
index 0a92ab38a8..86b3776b8c 100644
--- a/tensorflow/python/eager/backprop.py
+++ b/tensorflow/python/eager/backprop.py
@@ -727,24 +727,12 @@ def _num_elements(grad):
raise ValueError("`grad` not a Tensor or IndexedSlices.")
-_last_shape_dtype = [None, None]
-_last_zero = [None]
-
-
-def _zeros(shape, dtype):
- """Wraps array_ops.zeros to cache last zero for a given shape and dtype."""
- if [shape, dtype] != _last_shape_dtype:
- _last_shape_dtype[:] = [shape, dtype]
- _last_zero[0] = array_ops.zeros(shape, dtype)
- return _last_zero[0]
-
-
_default_vspace = imperative_grad.VSpace(
num_elements_fn=_num_elements,
aggregate_fn=_aggregate_grads,
tensor_id=ops.tensor_id,
- zeros=_zeros,
- ones=array_ops.ones)
+ zeros=array_ops.zeros,
+ ones_like=lambda x: ops.convert_to_tensor(array_ops.ones_like(x)))
class GradientTape(object):
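The `_zeros` helper removed above memoized the most recent zeros tensor keyed by (shape, dtype); the same trick reappears inline in imperative_grad.py later in this patch ("Cache the last used zero tensor"). A framework-free sketch of the pattern, with hypothetical names:

    _last_key = None
    _last_zeros = None


    def cached_zeros(shape, dtype, make_zeros):
        """Returns zeros for (shape, dtype), reusing the last tensor when the
        key repeats -- useful when many identical zero gradients are needed."""
        global _last_key, _last_zeros
        key = (tuple(shape), dtype)
        if key != _last_key:
            _last_key = key
            _last_zeros = make_zeros(shape, dtype)
        return _last_zeros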
diff --git a/tensorflow/python/eager/backprop_test.py b/tensorflow/python/eager/backprop_test.py
index ec9a185b73..ed54b8e12e 100644
--- a/tensorflow/python/eager/backprop_test.py
+++ b/tensorflow/python/eager/backprop_test.py
@@ -24,11 +24,11 @@ from tensorflow.python import pywrap_tensorflow
from tensorflow.python.eager import backprop
from tensorflow.python.eager import context
from tensorflow.python.eager import custom_gradient
+from tensorflow.python.eager import imperative_grad
from tensorflow.python.eager import tape
from tensorflow.python.eager import test
from tensorflow.python.framework import constant_op
from tensorflow.python.framework import dtypes
-from tensorflow.python.framework import errors_impl
from tensorflow.python.framework import ops
from tensorflow.python.framework import tensor_shape
from tensorflow.python.ops import array_ops
@@ -41,6 +41,7 @@ from tensorflow.python.ops import random_ops
from tensorflow.python.ops import resource_variable_ops
from tensorflow.python.ops import variables
from tensorflow.python.training import training
+from tensorflow.python.util import compat
class BackpropTest(test.TestCase):
@@ -102,18 +103,6 @@ class BackpropTest(test.TestCase):
grad_fn = backprop.gradients_function(f)
self.assertAllEqual(2., grad_fn(1., dy=2.)[0])
- def testErrors(self):
-
- @custom_gradient.custom_gradient
- def f(x):
- def grad(_):
- raise RuntimeError('x')
- return x, grad
-
- # TODO(apassos) raise the right error here
- with self.assertRaises(errors_impl.InternalError):
- backprop.gradients_function(f)(constant_op.constant(1.0))
-
def testImplicitGradOverEmbeddingLookup(self):
batch_size = 8
embedding_size = 512
@@ -494,6 +483,48 @@ class BackpropTest(test.TestCase):
initial_value=1., name='testSameObjectForMultipleArguments.Variable')
self.assertAllEqual([1., 1.], np_g(v, v))
+ def testEarlyGradAggregation(self):
+ # Needs to be a list so mutations by the callback affect this function.
+ add_n = []
+ def callback(op_type, unused_1, unused_2, unused_3, unused_4):
+ if compat.as_bytes(op_type) == compat.as_bytes('AddN'):
+ add_n.append(1)
+ context.context().add_post_execution_callback(callback)
+
+ v = resource_variable_ops.ResourceVariable(constant_op.constant(2.0),
+ name='v')
+ def fn():
+ outputs = []
+ for _ in range(20):
+ outputs.append(v * constant_op.constant(2.0))
+ return math_ops.add_n(outputs)
+
+ # By default the aggregation count is 2.
+ _ = backprop.implicit_grad(fn)()[0][1]
+ self.assertEqual(len(add_n), 2)
+ del add_n[:]
+
+ # Reduce the aggregation limit, cause the backprop to do some
+ # early aggregation.
+ # pylint: disable=protected-access
+ old_cnt = imperative_grad._MIN_AGGREGATE_COUNT
+ old_bytes = imperative_grad._MIN_AGGREGATE_BYTES
+ imperative_grad._MIN_AGGREGATE_COUNT = 10
+ imperative_grad._MIN_AGGREGATE_BYTES = 1
+ _ = backprop.implicit_grad(fn)()
+ self.assertEqual(len(add_n), 6)
+ del add_n[:]
+
+ # Aggregation is also limited by the memory.
+ imperative_grad._MIN_AGGREGATE_BYTES = 10000
+ _ = backprop.implicit_grad(fn)()
+ self.assertEqual(len(add_n), 2)
+
+ imperative_grad._MIN_AGGREGATE_COUNT = old_cnt
+ imperative_grad._MIN_AGGREGATE_BYTES = old_bytes
+ # pylint: enable=protected-access
+ context.context().clear_post_execution_callbacks()
+
def testImplicitGradientsCustomGradientAndCachedVariableValue(self):
@custom_gradient.custom_gradient
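For intuition about the thresholds this test lowers: early aggregation fires only once at least _MIN_AGGREGATE_COUNT gradients are pending for a tensor and their estimated footprint (4 bytes per element, as the accumulation loop assumes) exceeds _MIN_AGGREGATE_BYTES. A rough calculation with the defaults:

    pending = 5                            # gradients queued for one tensor
    elements = 8 * 1024 * 1024             # 8M float32 elements per gradient
    footprint = pending * elements * 4     # ~160 MiB estimated
    print(footprint > 128 * 1024 * 1024)   # True -> early AddN, memory released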
diff --git a/tensorflow/python/eager/benchmarks_test.py b/tensorflow/python/eager/benchmarks_test.py
index b555f16f1d..26a70a617d 100644
--- a/tensorflow/python/eager/benchmarks_test.py
+++ b/tensorflow/python/eager/benchmarks_test.py
@@ -66,8 +66,7 @@ class MicroBenchmarks(test.Benchmark):
func()
end = time.time()
mean_us = (end - start) * 1e6 / num_iters
- self.report_benchmark(iters=num_iters, wall_time=mean_us,
- extras={"examples_per_sec": num_iters/(end-start)})
+ self.report_benchmark(iters=num_iters, wall_time=mean_us)
def benchmark_create_np_array(self):
func = lambda: np.array([3.0])
diff --git a/tensorflow/python/eager/execute.py b/tensorflow/python/eager/execute.py
index c6457232e9..983c1ea73e 100644
--- a/tensorflow/python/eager/execute.py
+++ b/tensorflow/python/eager/execute.py
@@ -47,7 +47,8 @@ def execute(op_name, num_outputs, inputs, attrs, ctx, name=None):
name: Customized name for the operation.
Returns:
- List of output Tensor objects. The list is empty if there are no outputs
+ None if there are no outputs, a single Tensor object if there is one output,
+ and a list of Tensor objects if there are multiple outputs.
Raises:
An exception on error.
diff --git a/tensorflow/python/eager/function_test.py b/tensorflow/python/eager/function_test.py
index 209715894e..243efccac4 100644
--- a/tensorflow/python/eager/function_test.py
+++ b/tensorflow/python/eager/function_test.py
@@ -62,21 +62,13 @@ class FunctionTest(test.TestCase):
@function.defun
def step():
def inner():
+ tape.watch_variable(v)
return v * v
return backprop.implicit_grad(inner)()[0][0]
self.assertAllEqual(step(), 2.0)
- def testDefunDifferentiable(self):
- v = resource_variable_ops.ResourceVariable(1.0)
-
- @function.defun
- def f():
- return v * v
-
- self.assertAllEqual(backprop.implicit_grad(f)()[0][0], 2.0)
-
def testGraphModeCaptureVariable(self):
with context.graph_mode(), self.test_session() as sess:
diff --git a/tensorflow/python/eager/graph_callable.py b/tensorflow/python/eager/graph_callable.py
index ce51d17cfc..a7f1061d18 100644
--- a/tensorflow/python/eager/graph_callable.py
+++ b/tensorflow/python/eager/graph_callable.py
@@ -247,9 +247,7 @@ def _get_graph_callable_inputs(shape_and_dtypes):
ret.append(_get_graph_callable_inputs(x))
else:
raise errors.InvalidArgumentError(
- None, None, "Expected the argument to @graph_callable to be a "
- "(possibly nested) list or tuple of ShapeAndDtype objects, "
- "but got an object of type: %s" % type(x))
+ None, None, "shape_and_dtypes not ShapeAndDtype, type: %s " % type(x))
return tuple(ret) if isinstance(shape_and_dtypes, tuple) else ret
@@ -269,7 +267,7 @@ def _graph_callable_internal(func, shape_and_dtypes):
Args:
func: The tfe Python function to compile.
- shape_and_dtypes: A possibly nested list or tuple of ShapeAndDtype objects.
+ shape_and_dtypes: A list of type ShapeAndDtype.
Raises:
ValueError: If any one of func's outputs is not a Tensor.
@@ -432,10 +430,9 @@ def graph_callable(shape_and_dtypes):
ret = foo(tfe.Tensor(2.0)) # `ret` here now is a Tensor with value 9.0.
```
Args:
- shape_and_dtypes: A possibly nested list or tuple of ShapeAndDtype objects
- that specifies shape and type information for each of the callable's
- arguments. The length of this list must be equal to the number of
- arguments accepted by the wrapped function.
+ shape_and_dtypes: A list of type ShapeAndDtype that specifies shape and type
+ information for each of the callable's arguments. The length of this list
+ must be equal to the number of arguments accepted by the wrapped function.
Returns:
A callable graph object.
diff --git a/tensorflow/python/eager/imperative_grad.py b/tensorflow/python/eager/imperative_grad.py
index 837cad974a..c87719f84a 100644
--- a/tensorflow/python/eager/imperative_grad.py
+++ b/tensorflow/python/eager/imperative_grad.py
@@ -20,13 +20,114 @@ from __future__ import print_function
import collections
-from tensorflow.python import pywrap_tensorflow
-from tensorflow.python.framework import errors
+from tensorflow.python.eager import tape as tape_module
+
+
+# Terminology:
+#
+# - op: a possibly composite operation, which has an entry in the tape
+# - target: y in dy/dx, i.e. the value being differentiated
+# - source: x in dy/dx, i.e. a value we differentiate with respect to
+# - tensor: one of the many inputs or outputs of an operation
+#
+# Below here we do the gradient algorithm. It works as follows:
+#
+# First we filter the tape to just the subset of operations we want to
+# differentiate. In the process of doing so we count how many times each Tensor
+# is used as an input to an op (so we know when we're done computing gradients
+# for that Tensor). We also count, for each tape entry, how many of its output
+# Tensors need gradients to be computed (Tensors which are not used do not need
+# any gradients to be computed).
+#
+# Finally, we start a backprop stack with a set of tape entries for which we
+# have all gradients available. This set usually is a subset of the set of
+# targets (not all since targets which have outputs in the tape will not have
+# gradients available initially).
+#
+# Then we repeatedly pop an entry from the stack, run its backprop, and update
+# the gradients of its inputs. Once we have computed all gradients for a single
+# input we can mark this input as done, and this can trigger adding an entry to
+# the stack if all outputs of that entry are now done.
+#
+# When the stack is empty we have gradients for all tensors we're interested in.
+def _prepare_backprop(vspace, target, tensor_to_op, op_to_entry, id_sources):
+ """Filters the tape to only include relevant entries and counts tensor usages.
+
+ Args:
+ vspace: information about the space we're differentiating in.
+ target: the target to optimize.
+ tensor_to_op: Map from tensor id to key in op_to_entry that produced it.
+ op_to_entry: Map from op id to a tape.TapeEntry object
+ id_sources: the ids of the sources wrt the gradient is being taken.
+
+ Returns:
+ tensor_usage_counts: map from tensor id to how many downstream entries
+ use that tensor as an input.
+ op_to_entry_map: the filtered tape, containing only the entries relevant
+ to this gradient computation.
+ op_missing_tensor_counts: map from op id to how many of its output
+ gradients still need to be computed before its backprop can run.
+ """
+ tensor_stack = [vspace.tensor_id(x) for x in target]
+ tensor_usage_counts = {}
+ o_to_e = {} # Copy of just the bits we need from op_to_entry
+ while tensor_stack:
+ t = tensor_stack.pop()
+ op = tensor_to_op.get(t, None)
+ # op is None or -1 if the tensor is a source (i.e. was watched directly)
+ if op is None or op == -1 or op in o_to_e:
+ continue
+ op_trace = tape_module.TapeEntry(*op_to_entry[op])
+ o_to_e[op] = op_trace
+ for it in op_trace.input_ids:
+ if it in tensor_usage_counts:
+ tensor_usage_counts[it] += 1
+ else:
+ tensor_usage_counts[it] = 1
+ if it not in id_sources and it in tensor_to_op:
+ tensor_stack.append(it)
+ op_missing_tensor_counts = collections.defaultdict(int)
+ for t in tensor_usage_counts:
+ if t in tensor_to_op and tensor_to_op[t] is not None:
+ op_missing_tensor_counts[tensor_to_op[t]] += 1
+ return tensor_usage_counts, o_to_e, op_missing_tensor_counts
+
+
+def _initialize_backprop_stack(op_to_entry, op_missing_tensor):
+ """Returns the set of tape entries which are available for backprop."""
+ ready_ops = []
+ for op in op_to_entry:
+ if op not in op_missing_tensor:
+ ready_ops.append(op)
+ return ready_ops
+
+
+def _initial_gradients(vspace, target, output_gradients, tensor_usage_counts):
+ """Computes the initial gradients for each Tensor."""
+ # Initialize the backprop stack
+ gradients = collections.defaultdict(list)
+ for i, t in enumerate(target):
+ if vspace.tensor_id(t) in tensor_usage_counts:
+ # Can't provide a gradient of something we're trying to differentiate
+ assert output_gradients is None or output_gradients[i] is None
+ else:
+ if output_gradients is None or output_gradients[i] is None:
+ out_grad = vspace.ones_like(t)
+ else:
+ out_grad = output_gradients[i]
+ gradients[vspace.tensor_id(t)].append(out_grad)
+ return gradients
VSpace = collections.namedtuple(
"VSpace",
- ["aggregate_fn", "num_elements_fn", "tensor_id", "zeros", "ones"])
+ ["aggregate_fn", "num_elements_fn", "tensor_id", "zeros", "ones_like"])
+
+
+# If over MIN_AGGREGATE_COUNT gradients are accumulated and the total
+# memory consumption is over MIN_AGGREGATE_BYTES, do an early aggregation
+# so as to release the gradient tensor to save memory.
+_MIN_AGGREGATE_COUNT = 4
+_MIN_AGGREGATE_BYTES = 128 * 1024 * 1024
def imperative_grad(
@@ -60,6 +161,89 @@ def imperative_grad(
or if only non-differentiable functions of the source were used in the
computation of target.
"""
- with errors.raise_exception_on_not_ok_status() as status:
- return pywrap_tensorflow.TFE_Py_TapeGradient(
- tape._tape, vspace, target, sources, output_gradients, status) # pylint: disable=protected-access
+ tensor_to_op, op_to_entry = tape.export()
+ # This overwrites the op_to_entry variable, which will release all memory used
+ # to keep traces that are irrelevant to the gradient computation we're doing
+ # here.
+ id_sources = [vspace.tensor_id(t) for t in sources]
+ tensor_usage_counts, op_to_entry, op_missing_tensor = _prepare_backprop(
+ vspace, target, tensor_to_op, op_to_entry, id_sources)
+ ready_ops = _initialize_backprop_stack(op_to_entry, op_missing_tensor)
+ gradients = _initial_gradients(vspace, target, output_gradients,
+ tensor_usage_counts)
+ gradients_size = dict()
+ # Now exhaust the backprop stack
+ while ready_ops:
+ op = ready_ops.pop()
+ op_trace = op_to_entry.pop(op)
+ out_gradients = [gradients.pop(t, None) for t in op_trace.output_ids]
+
+ # Cache the last used zero tensor. We reuse it if the next one
+ # we need is of the same shape and dtype. This is very helpful in
+ # large splits and should have negligible overhead in other cases.
+ last_shape_and_dtype = None
+ last_zeros = None
+ for i in range(len(out_gradients)):
+ if out_gradients[i] is None:
+ # TODO(apassos) this should be in the right device
+ none_indices = _grad_fn_accepts_none_for_indices.get(
+ op_trace.op_type, None)
+ if none_indices is None or i not in none_indices:
+ shape_and_dtype = op_trace.output_shape_and_dtype[i]
+ if shape_and_dtype != last_shape_and_dtype:
+ last_shape_and_dtype = shape_and_dtype
+ last_zeros = vspace.zeros(*shape_and_dtype)
+ out_gradients[i] = last_zeros
+ else:
+ out_gradients[i] = vspace.aggregate_fn(out_gradients[i])
+
+ in_gradients = op_trace.backward_function(*(out_gradients))
+ for i, t in enumerate(op_trace.input_ids):
+ if in_gradients[i] is not None:
+ t_grads = gradients.setdefault(t, [])
+ t_grads.append(in_gradients[i])
+ if len(t_grads) >= _MIN_AGGREGATE_COUNT:
+ if t not in gradients_size:
+ gradients_size[t] = vspace.num_elements_fn(t_grads[-1])
+ size = gradients_size[t]
+
+ if len(t_grads) * size * 4 > _MIN_AGGREGATE_BYTES:
+ t_grads[:] = [vspace.aggregate_fn(t_grads)]
+ if tensor_usage_counts.get(t, 0) > 0:
+ tensor_usage_counts[t] -= 1
+ if (t in tensor_to_op
+ and tensor_usage_counts[t] == 0
+ and t not in id_sources):
+ in_op = tensor_to_op[t]
+ if in_op is None or in_op == -1:
+ continue
+ if op_missing_tensor.get(in_op, 0) > 0:
+ op_missing_tensor[in_op] -= 1
+ if op_missing_tensor.get(in_op, 0) == 0:
+ ready_ops.append(in_op)
+ result = []
+ for i, s in enumerate(sources):
+ g = gradients.get(vspace.tensor_id(s), None)
+ if g is None:
+ result.append(None)
+ else:
+ result.append(vspace.aggregate_fn(g))
+ return result
+
+
+# TODO(agarwal): use an automatic mechanism for handling None arguments to
+# gradient functions.
+# Some gradient functions can accept None arguments for gradients. The following
+# maps the operation name to the indices at which the corresponding gradient
+# function can accept None values.
+# e.g. FusedBatchNorm outputs 5 values and hence receives 5 gradient values
+# during backprop. However the gradient function uses only the first of those
+# values and ignores the rest. The entry, "FusedBatchNorm": [1, 2, 3, 4],
+# indicates that only the gradient corresponding to index 0 is used, and the
+# gradient values at indices 1-4 are ignored (and hence can be None). The
+# backprop algorithm can then leverage this by not constructing zeros to
+# pass for those indices.
+_grad_fn_accepts_none_for_indices = {
+ "SoftmaxCrossEntropyWithLogits": [1],
+ "FusedBatchNorm": [1, 2, 3, 4]
+}
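The terminology block and comments above describe the whole scheme: filter the tape, count tensor usages, seed a ready stack with ops whose output gradients are all available, then pop entries and propagate. A toy, framework-free sketch of that control flow on a two-op tape; values are fixed so the backward closures stay self-contained, and this is an illustration of the dependency counting, not the production path:

    import collections

    # Toy tape for y = h * w with h = x + 1, at x = 2, w = 3 (so h = 3).
    Entry = collections.namedtuple("Entry", ["inputs", "outputs", "backward"])
    tape = {
        "add1": Entry(["x"], ["h"], lambda dh: [dh]),            # dh/dx = 1
        "mul": Entry(["h", "w"], ["y"],
                     lambda dy: [dy * 3.0, dy * 3.0]),           # w = 3, h = 3
    }
    produced_by = {"h": "add1", "y": "mul"}

    # Count, per op, how many of its outputs still need gradients.
    missing = collections.defaultdict(int)
    for entry in tape.values():
        for t in entry.inputs:
            if t in produced_by:
                missing[produced_by[t]] += 1

    grads = collections.defaultdict(list)
    grads["y"].append(1.0)  # seed: ones_like(target)
    ready = [name for name in tape if missing[name] == 0]
    while ready:
        name = ready.pop()
        entry = tape[name]
        out = [sum(grads.pop(t, [0.0])) for t in entry.outputs]
        for t, g in zip(entry.inputs, entry.backward(*out)):
            grads[t].append(g)  # aggregated lazily, as in the loop above
            if t in produced_by:
                missing[produced_by[t]] -= 1
                if missing[produced_by[t]] == 0:
                    ready.append(produced_by[t])

    print({t: sum(gs) for t, gs in grads.items()})  # {'w': 3.0, 'x': 3.0}

The production loop adds two refinements on top of this skeleton: zero tensors are synthesized (and cached by shape/dtype) for missing output gradients unless _grad_fn_accepts_none_for_indices says None is acceptable for that op, and pending gradient lists are aggregated early once they cross the count and byte thresholds defined above.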
diff --git a/tensorflow/python/eager/pywrap_tensor.cc b/tensorflow/python/eager/pywrap_tensor.cc
index 653f3ef84e..ca283862f9 100644
--- a/tensorflow/python/eager/pywrap_tensor.cc
+++ b/tensorflow/python/eager/pywrap_tensor.cc
@@ -20,7 +20,6 @@ limitations under the License.
#include "tensorflow/python/lib/core/py_seq_tensor.h"
#include "tensorflow/python/lib/core/safe_ptr.h"
-#include "tensorflow/python/eager/pywrap_tensor.h"
#include "tensorflow/python/eager/pywrap_tfe.h"
#include "tensorflow/c/c_api.h"
@@ -574,7 +573,7 @@ bool EagerTensor_CheckExact(const PyObject* o) {
return Py_TYPE(o) == EagerTensorType;
}
-TFE_TensorHandle* EagerTensor_Handle(const PyObject* o) {
+TFE_TensorHandle* EagerTensorHandle(const PyObject* o) {
return reinterpret_cast<const EagerTensor*>(o)->handle;
}
@@ -595,11 +594,6 @@ PyObject* EagerTensorFromHandle(TFE_TensorHandle* handle) {
return reinterpret_cast<PyObject*>(t);
}
-tensorflow::int64 EagerTensor_id(const PyObject* tensor) {
- CHECK(EagerTensor_CheckExact(tensor));
- return reinterpret_cast<const EagerTensor*>(tensor)->id;
-}
-
PyObject* TFE_Py_InitEagerTensor(PyObject* base_class) {
if (!PyType_Check(base_class)) {
PyErr_SetString(
diff --git a/tensorflow/python/eager/pywrap_tensor.h b/tensorflow/python/eager/pywrap_tensor.h
deleted file mode 100644
index aa1efdd1b8..0000000000
--- a/tensorflow/python/eager/pywrap_tensor.h
+++ /dev/null
@@ -1,25 +0,0 @@
-/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
- http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-#ifndef TENSORFLOW_PYTHON_EAGER_PYWRAP_TENSOR_H_
-#define TENSORFLOW_PYTHON_EAGER_PYWRAP_TENSOR_H_
-
-#include "tensorflow/c/eager/c_api.h"
-#include "tensorflow/core/platform/types.h"
-#include "tensorflow/python/lib/core/numpy.h"
-
-bool EagerTensor_CheckExact(const PyObject* o);
-tensorflow::int64 EagerTensor_id(const PyObject* tensor);
-
-#endif // TENSORFLOW_PYTHON_EAGER_PYWRAP_TENSOR_H_
diff --git a/tensorflow/python/eager/pywrap_tfe.h b/tensorflow/python/eager/pywrap_tfe.h
index 6705483f3b..1d03df2933 100644
--- a/tensorflow/python/eager/pywrap_tfe.h
+++ b/tensorflow/python/eager/pywrap_tfe.h
@@ -81,7 +81,7 @@ bool EagerTensor_CheckExact(const PyObject* o);
PyObject* EagerTensorFromHandle(TFE_TensorHandle* handle);
// Extracts the handle inside EagerTensor object `o`. Returns nullptr on error.
-TFE_TensorHandle* EagerTensor_Handle(const PyObject* o);
+TFE_TensorHandle* EagerTensorHandle(const PyObject* o);
// Creates the `EagerTensor` class by subclassing `base_class` and returns the
// newly created type, or nullptr on error.
@@ -103,16 +103,7 @@ void TFE_Py_TapeRecordOperation(PyObject* tape, PyObject* op_type,
PyObject* output_tensors,
PyObject* input_tensor_ids,
PyObject* backward_function);
-
-// Computes a gradient based on information recorded on the tape.`tape` must
-// have been produced by TFE_Py_NewTape. `vspace` must be a
-// imperative_grad.py:VSpace named tuple. `target` and `sources` must be python
-// lists of Tensor objects. `output_gradients` is either None or a python list
-// of either Tensor or None, and if not None should have the same length as
-// target.
-PyObject* TFE_Py_TapeGradient(PyObject* tape, PyObject* vspace,
- PyObject* target, PyObject* sources,
- PyObject* output_gradients, TF_Status* status);
+PyObject* TFE_Py_TapeExport(PyObject* tape);
// Returns an EagerTensor of dimension [len(`tensor_list`)] containing
// the `slice_dim`'th dimension of each tensor in `tensor_list`. In other words,
diff --git a/tensorflow/python/eager/pywrap_tfe_src.cc b/tensorflow/python/eager/pywrap_tfe_src.cc
index 77b49be8f8..7456eb10f8 100644
--- a/tensorflow/python/eager/pywrap_tfe_src.cc
+++ b/tensorflow/python/eager/pywrap_tfe_src.cc
@@ -16,13 +16,10 @@ limitations under the License.
#include "tensorflow/python/eager/pywrap_tfe.h"
#include "tensorflow/c/c_api.h"
-#include "tensorflow/c/c_api_internal.h"
-#include "tensorflow/c/eager/c_api_internal.h"
#include "tensorflow/c/eager/tape.h"
#include "tensorflow/core/lib/strings/strcat.h"
#include "tensorflow/core/platform/mutex.h"
#include "tensorflow/core/platform/types.h"
-#include "tensorflow/python/eager/pywrap_tensor.h"
using tensorflow::string;
@@ -443,12 +440,10 @@ void TFE_DeleteContextCapsule(PyObject* context) {
TF_DeleteStatus(status);
}
-using GradientTape = tensorflow::eager::GradientTape<PyObject, PyObject>;
-
typedef struct {
PyObject_HEAD
/* Type-specific fields go here. */
- GradientTape* tape;
+ tensorflow::eager::GradientTape* tape;
} TFE_Py_Tape;
static void TFE_Py_Tape_Delete(PyObject* tape) {
@@ -483,7 +478,7 @@ PyObject* TFE_Py_NewTape() {
TFE_Py_Tape_Type.tp_new = PyType_GenericNew;
if (PyType_Ready(&TFE_Py_Tape_Type) < 0) return nullptr;
TFE_Py_Tape* tape = PyObject_NEW(TFE_Py_Tape, &TFE_Py_Tape_Type);
- tape->tape = new GradientTape();
+ tape->tape = new tensorflow::eager::GradientTape();
return reinterpret_cast<PyObject*>(tape);
}
@@ -520,50 +515,18 @@ static std::vector<tensorflow::int64> MakeIntList(PyObject* list) {
}
PyObject* TFE_Py_TapeShouldRecord(PyObject* py_tape, PyObject* tensors) {
- if (tensors == Py_None) {
- Py_RETURN_FALSE;
- }
- PyObject* seq = PySequence_Fast(tensors, "expected a sequence");
- if (seq == nullptr) {
- return nullptr;
- }
- int len = PySequence_Fast_GET_SIZE(seq);
- // TODO(apassos) consider not building a list and changing the API to check
- // each tensor individually.
- std::vector<tensorflow::int64> tensor_ids;
- tensor_ids.reserve(len);
- for (int i = 0; i < len; ++i) {
- PyObject* item = PySequence_Fast_GET_ITEM(seq, i);
- if (EagerTensor_CheckExact(item)) {
- tensor_ids.push_back(EagerTensor_id(item));
- } else {
- PyObject* id_field = PyObject_GetAttrString(item, "_id");
- if (id_field == nullptr) {
- return nullptr;
- }
- tensor_ids.push_back(MakeInt(id_field));
- Py_DECREF(id_field);
- }
- }
- Py_DECREF(seq);
TFE_Py_Tape* tape = reinterpret_cast<TFE_Py_Tape*>(py_tape);
- if (tape->tape->ShouldRecord(tensor_ids)) {
- Py_RETURN_TRUE;
- } else {
- Py_RETURN_FALSE;
- }
+ return PyBool_FromLong(tape->tape->ShouldRecord(MakeIntList(tensors)));
}
void TFE_Py_TapeWatch(PyObject* tape, tensorflow::int64 tensor_id) {
reinterpret_cast<TFE_Py_Tape*>(tape)->tape->Watch(tensor_id);
}
+// TODO(apassos) have a fast path for eager tensors here which gets information
+// from the handle instead of from the python object, and use this only for the
+// case of graph tensors.
static tensorflow::eager::TapeTensor TapeTensorFromTensor(PyObject* tensor) {
- if (EagerTensor_CheckExact(tensor)) {
- TFE_TensorHandle* t = EagerTensor_Handle(tensor);
- tensorflow::int64 id = EagerTensor_id(tensor);
- return tensorflow::eager::TapeTensor{id, t->t.dtype(), t->t.shape()};
- }
PyObject* id_field = PyObject_GetAttrString(tensor, "_id");
tensorflow::int64 id = MakeInt(id_field);
Py_DECREF(id_field);
@@ -629,239 +592,64 @@ void TFE_Py_TapeDeleteTrace(PyObject* tape, tensorflow::int64 tensor_id) {
reinterpret_cast<TFE_Py_Tape*>(tape)->tape->DeleteTrace(tensor_id);
}
-class PyVSpace : public tensorflow::eager::VSpace<PyObject, PyObject> {
- public:
- explicit PyVSpace(PyObject* py_vspace) : py_vspace_(py_vspace) {}
-
- tensorflow::Status Initialize() {
- num_elements_ = PyObject_GetAttrString(py_vspace_, "num_elements_fn");
- if (num_elements_ == nullptr) {
- return tensorflow::errors::InvalidArgument("invalid vspace");
+// TODO(apassos) when backprop.py moves to C most of this exporting logic can
+// disappear.
+PyObject* TFE_Py_TapeExport(PyObject* tape) {
+ std::pair<tensorflow::eager::TensorTape, tensorflow::eager::OpTape> exported =
+ reinterpret_cast<TFE_Py_Tape*>(tape)->tape->Export();
+ PyObject* tensor_tape = PyDict_New();
+ for (const auto& pair : exported.first) {
+ PyObject* tid = PyLong_FromLong(pair.first);
+ PyObject* opid = PyLong_FromLong(pair.second);
+ PyDict_SetItem(tensor_tape, tid, opid);
+ Py_DECREF(tid);
+ Py_DECREF(opid);
+ }
+
+ PyObject* op_tape = PyDict_New();
+ for (const auto& pair : exported.second) {
+ PyObject* opid = PyLong_FromLong(pair.first);
+ const auto& entry = pair.second;
+ PyObject* op_type = PyBytes_FromString(entry.op_type.c_str());
+ PyObject* output_ids = PyList_New(entry.output_tensor_info.size());
+ for (int i = 0; i < entry.output_tensor_info.size(); ++i) {
+ PyObject* tid = PyLong_FromLong(entry.output_tensor_info[i].id);
+ PyList_SET_ITEM(output_ids, i, tid);
}
- aggregate_fn_ = PyObject_GetAttrString(py_vspace_, "aggregate_fn");
- if (aggregate_fn_ == nullptr) {
- return tensorflow::errors::InvalidArgument("invalid vspace");
+ PyObject* input_ids = PyList_New(entry.input_tensor_id.size());
+ for (int i = 0; i < entry.input_tensor_id.size(); ++i) {
+ PyObject* tid = PyLong_FromLong(entry.input_tensor_id[i]);
+ PyList_SET_ITEM(input_ids, i, tid);
}
- zeros_ = PyObject_GetAttrString(py_vspace_, "zeros");
- if (zeros_ == nullptr) {
- return tensorflow::errors::InvalidArgument("invalid vspace");
- }
- ones_ =
- PyObject_GetAttrString(reinterpret_cast<PyObject*>(py_vspace_), "ones");
- if (ones_ == nullptr) {
- return tensorflow::errors::InvalidArgument("invalid vspace");
- }
- return tensorflow::Status::OK();
- }
-
- ~PyVSpace() override {
- Py_XDECREF(num_elements_);
- Py_XDECREF(aggregate_fn_);
- Py_XDECREF(zeros_);
- Py_XDECREF(ones_);
- }
-
- tensorflow::int64 NumElements(PyObject* tensor) const final {
- PyObject* arglist =
- Py_BuildValue("(O)", reinterpret_cast<PyObject*>(tensor));
- PyObject* result = PyEval_CallObject(num_elements_, arglist);
- tensorflow::int64 r = MakeInt(result);
- Py_DECREF(result);
- Py_DECREF(arglist);
- return r;
- }
-
- PyObject* AggregateGradients(
- tensorflow::gtl::ArraySlice<PyObject*> gradient_tensors) const final {
- PyObject* list = PyList_New(gradient_tensors.size());
- for (int i = 0; i < gradient_tensors.size(); ++i) {
- // Note: stealing a reference to the gradient tensors.
- CHECK(gradient_tensors[i] != nullptr);
- CHECK(gradient_tensors[i] != Py_None);
- PyList_SET_ITEM(list, i,
- reinterpret_cast<PyObject*>(gradient_tensors[i]));
- }
- PyObject* arglist = Py_BuildValue("(O)", list);
- CHECK(arglist != nullptr);
- PyObject* result = PyEval_CallObject(aggregate_fn_, arglist);
- Py_DECREF(arglist);
- Py_DECREF(list);
- return result;
- }
-
- PyObject* Zeros(tensorflow::TensorShape shape,
- tensorflow::DataType dtype) const final {
- PyObject* py_shape = PyTuple_New(shape.dims());
- for (int i = 0; i < shape.dims(); ++i) {
- PyTuple_SET_ITEM(py_shape, i, PyLong_FromLong(shape.dim_size(i)));
- }
- PyObject* py_dtype = PyLong_FromLong(static_cast<int>(dtype));
- PyObject* arg_list = Py_BuildValue("OO", py_shape, py_dtype);
- PyObject* result = PyEval_CallObject(zeros_, arg_list);
- Py_DECREF(arg_list);
- Py_DECREF(py_dtype);
- Py_DECREF(py_shape);
- return reinterpret_cast<PyObject*>(result);
- }
-
- PyObject* Ones(tensorflow::TensorShape shape,
- tensorflow::DataType dtype) const final {
- PyObject* py_shape = PyTuple_New(shape.dims());
- for (int i = 0; i < shape.dims(); ++i) {
- PyTuple_SET_ITEM(py_shape, i, PyLong_FromLong(shape.dim_size(i)));
- }
- PyObject* py_dtype = PyLong_FromLong(static_cast<int>(dtype));
- PyObject* arg_list = Py_BuildValue("OO", py_shape, py_dtype);
- PyObject* result = PyEval_CallObject(ones_, arg_list);
- Py_DECREF(arg_list);
- Py_DECREF(py_dtype);
- Py_DECREF(py_shape);
- return result;
- }
-
- tensorflow::Status CallBackwardFunction(
- PyObject* backward_function,
- tensorflow::gtl::ArraySlice<PyObject*> output_gradients,
- std::vector<PyObject*>* result) const final {
- PyObject* grads = PyTuple_New(output_gradients.size());
- for (int i = 0; i < output_gradients.size(); ++i) {
- if (output_gradients[i] == nullptr) {
- Py_INCREF(Py_None);
- PyTuple_SET_ITEM(grads, i, Py_None);
- } else {
- PyTuple_SET_ITEM(grads, i,
- reinterpret_cast<PyObject*>(output_gradients[i]));
+ PyObject* backward_function =
+ reinterpret_cast<PyObject*>(entry.backward_function);
+ PyObject* output_shape_and_dtype =
+ PyList_New(entry.output_tensor_info.size());
+ for (int i = 0; i < entry.output_tensor_info.size(); ++i) {
+ const tensorflow::TensorShape& shape = entry.output_tensor_info[i].shape;
+ PyObject* shape_list = PyList_New(shape.dims());
+ for (int j = 0; j < shape.dims(); ++j) {
+ PyList_SET_ITEM(shape_list, j, PyLong_FromLong(shape.dim_size(j)));
}
+ PyObject* type_enum = PyLong_FromLong(entry.output_tensor_info[i].dtype);
+ PyObject* tuple = PyTuple_Pack(2, shape_list, type_enum);
+ Py_DECREF(shape_list);
+ Py_DECREF(type_enum);
+ PyList_SET_ITEM(output_shape_and_dtype, i, tuple);
}
- PyObject* py_result = PyEval_CallObject(
- reinterpret_cast<PyObject*>(backward_function), grads);
- Py_DECREF(grads);
+ PyObject* opinfo = PyTuple_Pack(5, op_type, output_ids, input_ids,
+ backward_function, output_shape_and_dtype);
+ Py_DECREF(op_type);
+ Py_DECREF(output_ids);
+ Py_DECREF(input_ids);
Py_DECREF(backward_function);
- if (py_result == nullptr) {
- VLOG(1) << "Gradient function threw exceptions";
- if (VLOG_IS_ON(1)) {
- PyErr_Print();
- }
- return tensorflow::errors::Internal("gradient function threw exceptions");
- }
- result->clear();
- PyObject* seq =
- PySequence_Fast(py_result, "expected a sequence of gradients");
- if (seq == nullptr) {
- return tensorflow::errors::InvalidArgument(
- "gradient function did not return a list");
- }
- int len = PySequence_Fast_GET_SIZE(seq);
- VLOG(1) << "Gradient length is " << len;
- result->reserve(len);
- for (int i = 0; i < len; ++i) {
- PyObject* item = PySequence_Fast_GET_ITEM(seq, i);
- if (item == Py_None) {
- result->push_back(nullptr);
- } else {
- Py_INCREF(item);
- result->push_back(item);
- }
- }
- Py_DECREF(seq);
- Py_DECREF(py_result);
- return tensorflow::Status::OK();
- }
-
- void DeleteGradient(PyObject* tensor) const final { Py_XDECREF(tensor); }
-
- private:
- PyObject* py_vspace_;
-
- PyObject* num_elements_;
- PyObject* aggregate_fn_;
- PyObject* zeros_;
- PyObject* ones_;
-};
-
-std::vector<PyObject*> MakeTensorList(PyObject* tensors) {
- PyObject* seq = PySequence_Fast(tensors, "expected a sequence");
- if (seq == nullptr) {
- return {};
- }
- int len = PySequence_Fast_GET_SIZE(seq);
- std::vector<PyObject*> list;
- list.reserve(len);
- for (int i = 0; i < len; ++i) {
- list.push_back(PySequence_Fast_GET_ITEM(seq, i));
- }
- Py_DECREF(seq);
- return list;
-}
-
-std::vector<tensorflow::int64> MakeTensorIDList(PyObject* tensors) {
- PyObject* seq = PySequence_Fast(tensors, "expected a sequence");
- if (seq == nullptr) {
- return {};
- }
- int len = PySequence_Fast_GET_SIZE(seq);
- std::vector<tensorflow::int64> list;
- list.reserve(len);
- for (int i = 0; i < len; ++i) {
- PyObject* tensor = PySequence_Fast_GET_ITEM(seq, i);
- if (EagerTensor_CheckExact(tensor)) {
- list.push_back(EagerTensor_id(tensor));
- } else {
- PyObject* id_field = PyObject_GetAttrString(tensor, "_id");
- list.push_back(MakeInt(id_field));
- Py_DECREF(id_field);
- }
- }
- Py_DECREF(seq);
- return list;
-}
-
-PyObject* TFE_Py_TapeGradient(PyObject* tape, PyObject* vspace,
- PyObject* target, PyObject* sources,
- PyObject* output_gradients, TF_Status* status) {
- PyVSpace c_vspace(vspace);
- if (!c_vspace.Initialize().ok()) {
- return nullptr;
- }
-
- std::vector<tensorflow::int64> target_vec = MakeTensorIDList(target);
- if (PyErr_Occurred()) {
- return nullptr;
- }
- std::vector<tensorflow::int64> sources_vec = MakeTensorIDList(sources);
- if (PyErr_Occurred()) {
- return nullptr;
- }
- std::vector<PyObject*> outgrad_vec;
- if (output_gradients != Py_None) {
- outgrad_vec = MakeTensorList(output_gradients);
- if (PyErr_Occurred()) {
- return nullptr;
- }
- for (PyObject* tensor : outgrad_vec) {
- // Calling the backward function will eat a reference to the tensors in
- // outgrad_vec, so we need to increase their reference count.
- Py_INCREF(tensor);
- }
- }
- TFE_Py_Tape* tape_obj = reinterpret_cast<TFE_Py_Tape*>(tape);
- std::vector<PyObject*> result;
- status->status = tape_obj->tape->ComputeGradient(
- c_vspace, target_vec, sources_vec, outgrad_vec, &result);
- if (!status->status.ok()) {
- return nullptr;
- }
- if (!result.empty()) {
- PyObject* py_result = PyList_New(result.size());
- for (int i = 0; i < result.size(); ++i) {
- if (result[i] == nullptr) {
- Py_INCREF(Py_None);
- result[i] = Py_None;
- }
- PyList_SET_ITEM(py_result, i, reinterpret_cast<PyObject*>(result[i]));
- }
- return py_result;
- }
- Py_INCREF(Py_None);
- return Py_None;
+ Py_DECREF(output_shape_and_dtype);
+ PyDict_SetItem(op_tape, opid, opinfo);
+ Py_DECREF(opid);
+ Py_DECREF(opinfo);
+ }
+ PyObject* retval = PyTuple_Pack(2, tensor_tape, op_tape);
+ Py_DECREF(tensor_tape);
+ Py_DECREF(op_tape);
+ return retval;
}
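For orientation, a minimal illustrative sketch of the structure TFE_Py_TapeExport builds above; the ids, op type, shapes, and dtype enum below are made-up values, and in practice `backward_function` is a Python callable rather than None.

```python
# Illustrative only: plain-Python stand-ins for the two dicts built above.
tensor_tape = {7: 1}  # tensor id 7 was produced by the op with id 1
op_tape = {
    1: (b'Square',       # entry.op_type (bytes, via PyBytes_FromString)
        [7],             # output tensor ids
        [3],             # input tensor ids
        None,            # backward_function (a callable in practice)
        [([2, 2], 1)]),  # per-output (shape, dtype enum) tuples
}
# A consumer pairs each traced tensor with the op that produced it:
for tid, opid in tensor_tape.items():
    op_type, output_ids, input_ids, backward_fn, shapes = op_tape[opid]
```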
diff --git a/tensorflow/python/eager/tape.py b/tensorflow/python/eager/tape.py
index a06f5e1a67..c16aa8c2f7 100644
--- a/tensorflow/python/eager/tape.py
+++ b/tensorflow/python/eager/tape.py
@@ -72,7 +72,7 @@ class Tape(object):
True if any of the tensors is in the tape.
"""
return pywrap_tensorflow.TFE_Py_TapeShouldRecord(
- self._tape, tensors)
+ self._tape, [x._id for x in tensors]) # pylint: disable=protected-access
def watch(self, tensor):
"""Adds a tensor to the tape."""
@@ -99,6 +99,16 @@ class Tape(object):
"""Deletes any trace we have for this tensor."""
self._delete_tensor_id(tensor_id)
+ def export(self):
+ """Exports the internal state of this tape.
+
+ Returns:
+ tensor_tape: a map from tensor_id(tensor) to <identifier for op>
+ responsible for generating that tensor.
+ op_tape: a map from <identifier for op> to TapeEntry for that op.
+ """
+ return pywrap_tensorflow.TFE_Py_TapeExport(self._tape)
+
class _TapeStack(threading.local):
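A hedged usage sketch for the new export() method, using only the module helpers exercised by the test below (push_new_tape, watch, pop_tape) and assuming an eager context:

```python
# Sketch, assuming eager execution; mirrors the GC test added below.
from tensorflow.python.eager import tape
from tensorflow.python.framework import constant_op

tape.push_new_tape()
x = constant_op.constant(3.0)
tape.watch(x)                      # start tracing x
t = tape.pop_tape()
tensor_tape, op_tape = t.export()  # the two maps described in the docstring
```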
diff --git a/tensorflow/python/eager/tape_test.py b/tensorflow/python/eager/tape_test.py
index b490bac66d..c97cb62125 100644
--- a/tensorflow/python/eager/tape_test.py
+++ b/tensorflow/python/eager/tape_test.py
@@ -22,6 +22,7 @@ from __future__ import print_function
from tensorflow.python.eager import backprop
from tensorflow.python.eager import context
from tensorflow.python.eager import custom_gradient
+from tensorflow.python.eager import tape
from tensorflow.python.eager import test
from tensorflow.python.framework import constant_op
from tensorflow.python.framework import dtypes
@@ -165,6 +166,25 @@ class TapeTest(test.TestCase):
g, = backprop.gradients_function(fn, [0])(t)
self.assertAllEqual(g, 1.0)
+ def testTapeGC(self):
+ # TODO(apassos) figure out how to test this without using tape internal
+ # APIs.
+ tape.push_new_tape()
+
+ def f():
+ x = constant_op.constant(1.0)
+ tape.watch(x)
+ x = gradient_is_constant(x)
+ x = gradient_is_constant(x)
+ x = gradient_is_constant(x)
+
+ f()
+ t = tape.pop_tape()
+ tensor_tape, op_tape = t.export()
+ self.assertEqual(len(tensor_tape), 1) # The watched tensor will remain on
+ # the tape
+ self.assertEqual(len(op_tape), 0) # No operations should remain on the tape
+
def testCustomGradientGraphMode(self):
with context.graph_mode(), self.test_session():
diff --git a/tensorflow/python/estimator/BUILD b/tensorflow/python/estimator/BUILD
index 03f386e9cf..26f1fd888a 100644
--- a/tensorflow/python/estimator/BUILD
+++ b/tensorflow/python/estimator/BUILD
@@ -25,7 +25,6 @@ py_library(
srcs = ["estimator_lib.py"],
srcs_version = "PY2AND3",
deps = [
- ":baseline",
":dnn",
":dnn_linear_combined",
":estimator",
@@ -188,68 +187,6 @@ py_test(
)
py_library(
- name = "baseline",
- srcs = ["canned/baseline.py"],
- srcs_version = "PY2AND3",
- deps = [
- ":estimator",
- ":head",
- ":model_fn",
- ":optimizers",
- "//tensorflow/python:init_ops",
- "//tensorflow/python:layers",
- "//tensorflow/python:nn",
- "//tensorflow/python:partitioned_variables",
- "//tensorflow/python:summary",
- "//tensorflow/python:training",
- "//tensorflow/python:variable_scope",
- "//tensorflow/python/feature_column",
- "@six_archive//:six",
- ],
-)
-
-py_test(
- name = "baseline_test",
- size = "medium",
- srcs = ["canned/baseline_test.py"],
- srcs_version = "PY2AND3",
- tags = [
- "no_pip",
- "notsan", # b/67510291
- ],
- deps = [
- ":baseline",
- ":estimator",
- ":export_export",
- ":metric_keys",
- ":numpy_io",
- ":pandas_io",
- ":run_config",
- "//tensorflow/core:protos_all_py",
- "//tensorflow/python:array_ops",
- "//tensorflow/python:check_ops",
- "//tensorflow/python:client",
- "//tensorflow/python:client_testlib",
- "//tensorflow/python:constant_op",
- "//tensorflow/python:control_flow_ops",
- "//tensorflow/python:data_flow_ops",
- "//tensorflow/python:dtypes",
- "//tensorflow/python:framework_ops",
- "//tensorflow/python:math_ops",
- "//tensorflow/python:parsing_ops",
- "//tensorflow/python:platform",
- "//tensorflow/python:sparse_tensor",
- "//tensorflow/python:state_ops",
- "//tensorflow/python:summary",
- "//tensorflow/python:training",
- "//tensorflow/python:variable_scope",
- "//tensorflow/python:variables",
- "//tensorflow/python/feature_column",
- "@six_archive//:six",
- ],
-)
-
-py_library(
name = "dnn",
srcs = ["canned/dnn.py"],
srcs_version = "PY2AND3",
diff --git a/tensorflow/python/estimator/canned/baseline.py b/tensorflow/python/estimator/canned/baseline.py
deleted file mode 100644
index 96e4ecd29f..0000000000
--- a/tensorflow/python/estimator/canned/baseline.py
+++ /dev/null
@@ -1,349 +0,0 @@
-# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-# ==============================================================================
-"""Baseline estimators.
-
-Baseline estimators are bias-only estimators that can be used for debugging
-and as simple baselines.
-
-Example:
-
-```
-# Build BaselineClassifier
-classifier = BaselineClassifier(n_classes=3)
-
-# Input builders
-def input_fn_train(): # returns x, y (where y represents label's class index).
- pass
-
-def input_fn_eval(): # returns x, y (where y represents label's class index).
- pass
-
-# Fit model.
-classifier.train(input_fn=input_fn_train)
-
-# Evaluate cross entropy between the test and train labels.
-loss = classifier.evaluate(input_fn=input_fn_eval)["loss"]
-
-# predict outputs the probability distribution of the classes as seen in
-# training.
-predictions = classifier.predict(new_samples)
-```
-"""
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-
-import six
-
-from tensorflow.python.estimator import estimator
-from tensorflow.python.estimator.canned import head as head_lib
-from tensorflow.python.estimator.canned import optimizers
-from tensorflow.python.feature_column import feature_column as feature_column_lib
-from tensorflow.python.framework import ops
-from tensorflow.python.ops import array_ops
-from tensorflow.python.ops import check_ops
-from tensorflow.python.ops import init_ops
-from tensorflow.python.ops import math_ops
-from tensorflow.python.ops import variable_scope
-from tensorflow.python.training import training_util
-
-# The default learning rate of 0.3 is a historical artifact of the initial
-# implementation, but seems a reasonable choice.
-_LEARNING_RATE = 0.3
-
-
-def _get_weight_column_key(weight_column):
- if weight_column is None:
- return None
- if isinstance(weight_column, six.string_types):
- return weight_column
- if not isinstance(weight_column, feature_column_lib._NumericColumn): # pylint: disable=protected-access
- raise TypeError('Weight column must be either a string or _NumericColumn.'
- ' Given type: {}.'.format(type(weight_column)))
- return weight_column.key
-
-
-def _baseline_logit_fn_builder(num_outputs, weight_column=None):
- """Function builder for a baseline logit_fn.
-
- Args:
- num_outputs: Number of outputs for the model.
- weight_column: A string or a `_NumericColumn` created by
- `tf.feature_column.numeric_column` defining feature column representing
- weights. It will be multiplied by the loss of the example.
- Returns:
- A logit_fn (see below).
- """
-
- def baseline_logit_fn(features):
- """Baseline model logit_fn.
-
- The baseline model simply learns a bias, so the output logits are a
- `Variable` with one weight for each output that learns the bias for the
- corresponding output.
-
- Args:
- features: The first item returned from the `input_fn` passed to `train`,
- `evaluate`, and `predict`. This should be a single `Tensor` or dict with
- `Tensor` values.
- Returns:
- A `Tensor` representing the logits.
- """
- size_checks = []
- batch_size = None
-
- weight_column_key = _get_weight_column_key(weight_column)
-
- # The first dimension is assumed to be the batch size and must be
- # consistent among all of the features.
- for key, feature in features.items():
- # Skip weight_column to ensure we don't add size checks to it.
- # These would introduce a dependency on the weight at serving time.
- if key == weight_column_key:
- continue
- first_dim = array_ops.shape(feature)[0]
- if batch_size is None:
- batch_size = first_dim
- else:
- size_checks.append(check_ops.assert_equal(batch_size, first_dim))
-
- with ops.control_dependencies(size_checks):
- with variable_scope.variable_scope('baseline'):
- bias = variable_scope.get_variable('bias', shape=[num_outputs],
- initializer=init_ops.Zeros)
- return math_ops.multiply(bias, array_ops.ones([batch_size,
- num_outputs]))
-
- return baseline_logit_fn
-
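As the docstring above notes, the logit computation reduces to broadcasting a learned bias across the batch; a minimal NumPy sketch of that claim, with illustrative sizes:

```python
# Sketch of what baseline_logit_fn computes, in plain NumPy.
import numpy as np

batch_size, num_outputs = 2, 3
bias = np.zeros(num_outputs)  # the model's only trainable parameter
logits = bias * np.ones((batch_size, num_outputs))  # one row per example
assert logits.shape == (batch_size, num_outputs)
```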
-
-def _baseline_model_fn(features, labels, mode, head, optimizer,
- weight_column=None, config=None):
- """Model_fn for baseline models.
-
- Args:
- features: `Tensor` or dict of `Tensor` (depends on data passed to `train`).
- labels: `Tensor` of labels that are compatible with the `Head` instance.
- mode: Defines whether this is training, evaluation or prediction.
- See `ModeKeys`.
- head: A `Head` instance.
- optimizer: String, `tf.Optimizer` object, or callable that creates the
- optimizer to use for training. If not specified, will use `FtrlOptimizer`
- with a default learning rate of 0.3.
- weight_column: A string or a `_NumericColumn` created by
- `tf.feature_column.numeric_column` defining feature column representing
- weights. It will be multiplied by the loss of the example.
- config: `RunConfig` object to configure the runtime settings.
-
- Raises:
- KeyError: If weight column is specified but not present.
- ValueError: If features is an empty dictionary.
-
- Returns:
- An `EstimatorSpec` instance.
- """
- del config # Unused.
-
- logit_fn = _baseline_logit_fn_builder(head.logits_dimension, weight_column)
- logits = logit_fn(features)
-
- def train_op_fn(loss):
- opt = optimizers.get_optimizer_instance(
- optimizer, learning_rate=_LEARNING_RATE)
- return opt.minimize(loss, global_step=training_util.get_global_step())
-
- return head.create_estimator_spec(
- features=features,
- mode=mode,
- logits=logits,
- labels=labels,
- train_op_fn=train_op_fn)
-
-
-class BaselineClassifier(estimator.Estimator):
- """A classifier that can establish a simple baseline.
-
- This classifier ignores feature values and will learn to predict the average
- value of each label. For single-label problems, this will predict the
- probability distribution of the classes as seen in the labels. For multi-label
- problems, this will predict the fraction of examples that are positive for
- each class.
-
- Example:
-
- ```python
-
- # Build BaselineClassifier
- classifier = BaselineClassifier(n_classes=3)
-
- # Input builders
- def input_fn_train(): # returns x, y (where y represents label's class index).
- pass
-
- def input_fn_eval(): # returns x, y (where y represents label's class index).
- pass
-
- # Fit model.
- classifier.train(input_fn=input_fn_train)
-
- # Evaluate cross entropy between the test and train labels.
- loss = classifier.evaluate(input_fn=input_fn_eval)["loss"]
-
- # predict outputs the probability distribution of the classes as seen in
- # training.
- predictions = classifier.predict(new_samples)
-
- ```
-
- Input of `train` and `evaluate` should have the following features,
- otherwise there will be a `KeyError`:
-
- * if `weight_column` is not `None`, a feature with
- `key=weight_column` whose value is a `Tensor`.
- """
-
- def __init__(self,
- model_dir=None,
- n_classes=2,
- weight_column=None,
- label_vocabulary=None,
- optimizer='Ftrl',
- config=None):
- """Initializes a BaselineClassifier instance.
-
- Args:
- model_dir: Directory to save model parameters, graph, etc. This can
- also be used to load checkpoints from the directory into an estimator
- to continue training a previously saved model.
- n_classes: number of label classes. Default is binary classification.
- It must be greater than 1. Note: Class labels are integers representing
- the class index (i.e. values from 0 to n_classes-1). For arbitrary
- label values (e.g. string labels), convert to class indices first.
- weight_column: A string or a `_NumericColumn` created by
- `tf.feature_column.numeric_column` defining feature column representing
- weights. It will be multiplied by the loss of the example.
- label_vocabulary: Optional list of strings with size `[n_classes]`
- defining the label vocabulary. Only supported for `n_classes` > 2.
- optimizer: String, `tf.Optimizer` object, or callable that creates the
- optimizer to use for training. If not specified, will use
- `FtrlOptimizer` with a default learning rate of 0.3.
- config: `RunConfig` object to configure the runtime settings.
- Returns:
- A `BaselineClassifier` estimator.
-
- Raises:
- ValueError: If `n_classes` < 2.
- """
- if n_classes == 2:
- head = head_lib._binary_logistic_head_with_sigmoid_cross_entropy_loss( # pylint: disable=protected-access
- weight_column=weight_column,
- label_vocabulary=label_vocabulary)
- else:
- head = head_lib._multi_class_head_with_softmax_cross_entropy_loss( # pylint: disable=protected-access
- n_classes, weight_column=weight_column,
- label_vocabulary=label_vocabulary)
- def _model_fn(features, labels, mode, config):
- return _baseline_model_fn(
- features=features,
- labels=labels,
- mode=mode,
- head=head,
- optimizer=optimizer,
- weight_column=weight_column,
- config=config)
- super(BaselineClassifier, self).__init__(
- model_fn=_model_fn,
- model_dir=model_dir,
- config=config)
-
-
-class BaselineRegressor(estimator.Estimator):
- """A regressor that can establish a simple baseline.
-
- This regressor ignores feature values and will learn to predict the average
- value of each label.
-
- Example:
-
- ```python
-
- # Build BaselineRegressor
- regressor = BaselineRegressor()
-
- # Input builders
- def input_fn_train(): # returns x, y (where y is the label).
- pass
-
- def input_fn_eval(): # returns x, y (where y is the label).
- pass
-
- # Fit model.
- regressor.train(input_fn=input_fn_train)
-
- # Evaluate squared-loss between the test and train targets.
- loss = regressor.evaluate(input_fn=input_fn_eval)["loss"]
-
- # predict outputs the mean value seen during training.
- predictions = regressor.predict(new_samples)
- ```
-
- Input of `train` and `evaluate` should have the following features,
- otherwise there will be a `KeyError`:
-
- * if `weight_column` is not `None`, a feature with
- `key=weight_column` whose value is a `Tensor`.
- """
-
- def __init__(self,
- model_dir=None,
- label_dimension=1,
- weight_column=None,
- optimizer='Ftrl',
- config=None):
- """Initializes a BaselineRegressor instance.
-
- Args:
- model_dir: Directory to save model parameters, graph, etc. This can
- also be used to load checkpoints from the directory into an estimator
- to continue training a previously saved model.
- label_dimension: Number of regression targets per example. This is the
- size of the last dimension of the labels and logits `Tensor` objects
- (typically, these have shape `[batch_size, label_dimension]`).
- weight_column: A string or a `_NumericColumn` created by
- `tf.feature_column.numeric_column` defining feature column representing
- weights. It will be multiplied by the loss of the example.
- optimizer: String, `tf.Optimizer` object, or callable that creates the
- optimizer to use for training. If not specified, will use
- `FtrlOptimizer` with a default learning rate of 0.3.
- config: `RunConfig` object to configure the runtime settings.
- Returns:
- A `BaselineRegressor` estimator.
- """
-
- head = head_lib._regression_head_with_mean_squared_error_loss( # pylint: disable=protected-access
- label_dimension=label_dimension,
- weight_column=weight_column)
- def _model_fn(features, labels, mode, config):
- return _baseline_model_fn(
- features=features,
- labels=labels,
- mode=mode,
- head=head,
- optimizer=optimizer,
- config=config)
- super(BaselineRegressor, self).__init__(
- model_fn=_model_fn,
- model_dir=model_dir,
- config=config)
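To make the docstring claim concrete, a synthetic check that a bias-only classifier's target is simply the empirical label distribution (the numbers are illustrative, not produced by the estimator):

```python
# Illustrative: the distribution a trained BaselineClassifier approaches.
import numpy as np

labels = np.array([0, 1, 1, 1])  # single-label problem, 2 classes
empirical = np.bincount(labels) / float(len(labels))
print(empirical)                 # -> [0.25 0.75]
```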
diff --git a/tensorflow/python/estimator/canned/baseline_test.py b/tensorflow/python/estimator/canned/baseline_test.py
deleted file mode 100644
index 96639e88ea..0000000000
--- a/tensorflow/python/estimator/canned/baseline_test.py
+++ /dev/null
@@ -1,1545 +0,0 @@
-# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-# ==============================================================================
-"""Tests for baseline.py."""
-
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-
-import math
-import os
-import shutil
-import tempfile
-
-import numpy as np
-import six
-
-from tensorflow.core.example import example_pb2
-from tensorflow.core.example import feature_pb2
-from tensorflow.python.client import session as tf_session
-from tensorflow.python.estimator.canned import baseline
-from tensorflow.python.estimator.canned import metric_keys
-from tensorflow.python.estimator.export import export
-from tensorflow.python.estimator.inputs import numpy_io
-from tensorflow.python.estimator.inputs import pandas_io
-from tensorflow.python.feature_column import feature_column as feature_column_lib
-from tensorflow.python.framework import dtypes
-from tensorflow.python.framework import ops
-from tensorflow.python.ops import check_ops
-from tensorflow.python.ops import control_flow_ops
-from tensorflow.python.ops import data_flow_ops
-from tensorflow.python.ops import math_ops
-from tensorflow.python.ops import parsing_ops
-from tensorflow.python.ops import state_ops
-from tensorflow.python.ops import variable_scope
-from tensorflow.python.ops import variables
-from tensorflow.python.platform import gfile
-from tensorflow.python.platform import test
-from tensorflow.python.summary.writer import writer_cache
-from tensorflow.python.training import checkpoint_utils
-from tensorflow.python.training import input as input_lib
-from tensorflow.python.training import optimizer
-from tensorflow.python.training import queue_runner
-from tensorflow.python.training import saver
-
-
-try:
- # pylint: disable=g-import-not-at-top
- import pandas as pd
- HAS_PANDAS = True
-except IOError:
- # Pandas writes a temporary file during import. If it fails, don't use pandas.
- HAS_PANDAS = False
-except ImportError:
- HAS_PANDAS = False
-
-# pylint rules which are disabled by default for test files.
-# pylint: disable=invalid-name,protected-access,missing-docstring
-
-# Names of variables created by model.
-BIAS_NAME = 'baseline/bias'
-
-
-def assert_close(expected, actual, rtol=1e-04, name='assert_close'):
- with ops.name_scope(name, 'assert_close', (expected, actual, rtol)) as scope:
- expected = ops.convert_to_tensor(expected, name='expected')
- actual = ops.convert_to_tensor(actual, name='actual')
- rdiff = math_ops.abs(expected - actual, 'diff') / math_ops.abs(expected)
- rtol = ops.convert_to_tensor(rtol, name='rtol')
- return check_ops.assert_less(
- rdiff,
- rtol,
- data=('Condition expected =~ actual did not hold element-wise:'
- 'expected = ', expected, 'actual = ', actual, 'rdiff = ', rdiff,
- 'rtol = ', rtol,),
- name=scope)
-
-
-def save_variables_to_ckpt(model_dir):
- init_all_op = [variables.global_variables_initializer()]
- with tf_session.Session() as sess:
- sess.run(init_all_op)
- saver.Saver().save(sess, os.path.join(model_dir, 'model.ckpt'))
-
-
-def queue_parsed_features(feature_map):
- tensors_to_enqueue = []
- keys = []
- for key, tensor in six.iteritems(feature_map):
- keys.append(key)
- tensors_to_enqueue.append(tensor)
- queue_dtypes = [x.dtype for x in tensors_to_enqueue]
- input_queue = data_flow_ops.FIFOQueue(capacity=100, dtypes=queue_dtypes)
- queue_runner.add_queue_runner(
- queue_runner.QueueRunner(input_queue,
- [input_queue.enqueue(tensors_to_enqueue)]))
- dequeued_tensors = input_queue.dequeue()
- return {keys[i]: dequeued_tensors[i] for i in range(len(dequeued_tensors))}
-
-
-def sorted_key_dict(unsorted_dict):
- return {k: unsorted_dict[k] for k in sorted(unsorted_dict)}
-
-
-def sigmoid(x):
- return 1 / (1 + np.exp(-1.0 * x))
-
-
-def _baseline_regressor_fn(*args, **kwargs):
- return baseline.BaselineRegressor(*args, **kwargs)
-
-
-def _baseline_classifier_fn(*args, **kwargs):
- return baseline.BaselineClassifier(*args, **kwargs)
-
-
-# Tests for Baseline Regressor.
-
-
-# TODO(b/36813849): Add tests with dynamic shape inputs using placeholders.
-class BaselineRegressorEvaluationTest(test.TestCase):
-
- def setUp(self):
- self._model_dir = tempfile.mkdtemp()
-
- def tearDown(self):
- if self._model_dir:
- writer_cache.FileWriterCache.clear()
- shutil.rmtree(self._model_dir)
-
- def test_evaluation_for_simple_data(self):
- with ops.Graph().as_default():
- variables.Variable([13.0], name=BIAS_NAME)
- variables.Variable(
- 100, name=ops.GraphKeys.GLOBAL_STEP, dtype=dtypes.int64)
- save_variables_to_ckpt(self._model_dir)
-
- baseline_regressor = _baseline_regressor_fn(model_dir=self._model_dir)
- eval_metrics = baseline_regressor.evaluate(
- input_fn=lambda: ({'age': ((1,),)}, ((10.,),)), steps=1)
-
- # Logit is bias = 13, while label is 10. Loss is 3**2 = 9.
- self.assertDictEqual({
- metric_keys.MetricKeys.LOSS: 9.,
- metric_keys.MetricKeys.LOSS_MEAN: 9.,
- ops.GraphKeys.GLOBAL_STEP: 100
- }, eval_metrics)
-
- def test_evaluation_batch(self):
- """Tests evaluation for batch_size==2."""
- with ops.Graph().as_default():
- variables.Variable([13.0], name=BIAS_NAME)
- variables.Variable(
- 100, name=ops.GraphKeys.GLOBAL_STEP, dtype=dtypes.int64)
- save_variables_to_ckpt(self._model_dir)
-
- baseline_regressor = _baseline_regressor_fn(model_dir=self._model_dir)
- eval_metrics = baseline_regressor.evaluate(
- input_fn=lambda: ({'age': ((1,), (1,))}, ((10.,), (10.,))), steps=1)
-
- # Logit is bias = 13, while label is 10.
- # Loss per example is 3**2 = 9.
- # Training loss is the sum over batch = 9 + 9 = 18
- # Average loss is the average over batch = 9
- self.assertDictEqual({
- metric_keys.MetricKeys.LOSS: 18.,
- metric_keys.MetricKeys.LOSS_MEAN: 9.,
- ops.GraphKeys.GLOBAL_STEP: 100
- }, eval_metrics)
-
- def test_evaluation_weights(self):
- """Tests evaluation with weights."""
- with ops.Graph().as_default():
- variables.Variable([13.0], name=BIAS_NAME)
- variables.Variable(
- 100, name=ops.GraphKeys.GLOBAL_STEP, dtype=dtypes.int64)
- save_variables_to_ckpt(self._model_dir)
-
- def _input_fn():
- features = {'age': ((1,), (1,)), 'weights': ((1.,), (2.,))}
- labels = ((10.,), (10.,))
- return features, labels
-
- baseline_regressor = _baseline_regressor_fn(
- weight_column='weights',
- model_dir=self._model_dir)
- eval_metrics = baseline_regressor.evaluate(input_fn=_input_fn, steps=1)
-
- # Logit is bias = 13, while label is 10.
- # Loss per example is 3**2 = 9.
- # Training loss is the weighted sum over the batch = 1*9 + 2*9 = 27
- # Average loss is the weighted average = (1*9 + 2*9) / (1 + 2) = 9
- self.assertDictEqual({
- metric_keys.MetricKeys.LOSS: 27.,
- metric_keys.MetricKeys.LOSS_MEAN: 9.,
- ops.GraphKeys.GLOBAL_STEP: 100
- }, eval_metrics)
-
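A quick check of the weighted-loss arithmetic in the comment above:

```python
# Worked check: per-example loss 9 with weights (1, 2).
per_example_loss = 9.0
weights = [1.0, 2.0]
weighted_sum = sum(w * per_example_loss for w in weights)  # 27.0
weighted_mean = weighted_sum / sum(weights)                # 9.0
assert (weighted_sum, weighted_mean) == (27.0, 9.0)
```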
- def test_evaluation_for_multi_dimensions(self):
- label_dim = 2
- with ops.Graph().as_default():
- variables.Variable([46.0, 58.0], name=BIAS_NAME)
- variables.Variable(100, name='global_step', dtype=dtypes.int64)
- save_variables_to_ckpt(self._model_dir)
-
- baseline_regressor = _baseline_regressor_fn(
- label_dimension=label_dim,
- model_dir=self._model_dir)
- input_fn = numpy_io.numpy_input_fn(
- x={
- 'age': np.array([[2., 4., 5.]]),
- },
- y=np.array([[46., 58.]]),
- batch_size=1,
- num_epochs=None,
- shuffle=False)
- eval_metrics = baseline_regressor.evaluate(input_fn=input_fn, steps=1)
-
- self.assertItemsEqual(
- (metric_keys.MetricKeys.LOSS, metric_keys.MetricKeys.LOSS_MEAN,
- ops.GraphKeys.GLOBAL_STEP), eval_metrics.keys())
-
- # Logit is bias which is [46, 58]
- self.assertAlmostEqual(0, eval_metrics[metric_keys.MetricKeys.LOSS])
-
-
-class BaselineRegressorPredictTest(test.TestCase):
-
- def setUp(self):
- self._model_dir = tempfile.mkdtemp()
-
- def tearDown(self):
- if self._model_dir:
- writer_cache.FileWriterCache.clear()
- shutil.rmtree(self._model_dir)
-
- def test_1d(self):
- """Tests predict when all variables are one-dimensional."""
- with ops.Graph().as_default():
- variables.Variable([.2], name=BIAS_NAME)
- variables.Variable(100, name='global_step', dtype=dtypes.int64)
- save_variables_to_ckpt(self._model_dir)
-
- baseline_regressor = _baseline_regressor_fn(model_dir=self._model_dir)
-
- predict_input_fn = numpy_io.numpy_input_fn(
- x={'x': np.array([[2.]])},
- y=None,
- batch_size=1,
- num_epochs=1,
- shuffle=False)
- predictions = baseline_regressor.predict(input_fn=predict_input_fn)
- predicted_scores = [x['predictions'] for x in predictions]
- # The baseline model ignores the input; the prediction is the bias = .2
- self.assertAllClose([[.2]], predicted_scores)
-
- def testMultiDim(self):
- """Tests predict when all variables are multi-dimenstional."""
- batch_size = 2
- label_dimension = 3
- with ops.Graph().as_default():
- variables.Variable( # shape=[label_dimension]
- [.2, .4, .6], name=BIAS_NAME)
- variables.Variable(100, name='global_step', dtype=dtypes.int64)
- save_variables_to_ckpt(self._model_dir)
-
- baseline_regressor = _baseline_regressor_fn(
- label_dimension=label_dimension,
- model_dir=self._model_dir)
-
- predict_input_fn = numpy_io.numpy_input_fn(
- # x shape=[batch_size, x_dim]
- x={'x': np.array([[1., 2., 3., 4.], [5., 6., 7., 8.]])},
- y=None,
- batch_size=batch_size,
- num_epochs=1,
- shuffle=False)
- predictions = baseline_regressor.predict(input_fn=predict_input_fn)
- predicted_scores = [x['predictions'] for x in predictions]
- # score = bias, shape=[batch_size, label_dimension]
- self.assertAllClose([[0.2, 0.4, 0.6], [0.2, 0.4, 0.6]],
- predicted_scores)
-
-
-class BaselineRegressorIntegrationTest(test.TestCase):
-
- def setUp(self):
- self._model_dir = tempfile.mkdtemp()
-
- def tearDown(self):
- if self._model_dir:
- writer_cache.FileWriterCache.clear()
- shutil.rmtree(self._model_dir)
-
- def _test_complete_flow(self, train_input_fn, eval_input_fn, predict_input_fn,
- input_dimension, label_dimension, prediction_length):
- feature_columns = [
- feature_column_lib.numeric_column('x', shape=(input_dimension,))
- ]
- est = _baseline_regressor_fn(
- label_dimension=label_dimension,
- model_dir=self._model_dir)
-
- # TRAIN
- # learn y = x
- est.train(train_input_fn, steps=200)
-
- # EVALUATE
- scores = est.evaluate(eval_input_fn)
- self.assertEqual(200, scores[ops.GraphKeys.GLOBAL_STEP])
- self.assertIn(metric_keys.MetricKeys.LOSS, six.iterkeys(scores))
-
- # PREDICT
- predictions = np.array(
- [x['predictions'] for x in est.predict(predict_input_fn)])
- self.assertAllEqual((prediction_length, label_dimension), predictions.shape)
-
- # EXPORT
- feature_spec = feature_column_lib.make_parse_example_spec(feature_columns)
- serving_input_receiver_fn = export.build_parsing_serving_input_receiver_fn(
- feature_spec)
- export_dir = est.export_savedmodel(tempfile.mkdtemp(),
- serving_input_receiver_fn)
- self.assertTrue(gfile.Exists(export_dir))
-
- def test_numpy_input_fn(self):
- """Tests complete flow with numpy_input_fn."""
- label_dimension = 2
- input_dimension = label_dimension
- batch_size = 10
- prediction_length = batch_size
- data = np.linspace(0., 2., batch_size * label_dimension, dtype=np.float32)
- data = data.reshape(batch_size, label_dimension)
-
- train_input_fn = numpy_io.numpy_input_fn(
- x={'x': data},
- y=data,
- batch_size=batch_size,
- num_epochs=None,
- shuffle=True)
- eval_input_fn = numpy_io.numpy_input_fn(
- x={'x': data},
- y=data,
- batch_size=batch_size,
- num_epochs=1,
- shuffle=False)
- predict_input_fn = numpy_io.numpy_input_fn(
- x={'x': data},
- y=None,
- batch_size=batch_size,
- num_epochs=1,
- shuffle=False)
-
- self._test_complete_flow(
- train_input_fn=train_input_fn,
- eval_input_fn=eval_input_fn,
- predict_input_fn=predict_input_fn,
- input_dimension=input_dimension,
- label_dimension=label_dimension,
- prediction_length=prediction_length)
-
- def test_pandas_input_fn(self):
- """Tests complete flow with pandas_input_fn."""
- if not HAS_PANDAS:
- return
-
- # Pandas DataFrame naturally supports 1 dim data only.
- label_dimension = 1
- input_dimension = label_dimension
- batch_size = 10
- data = np.array([1., 2., 3., 4.], dtype=np.float32)
- x = pd.DataFrame({'x': data})
- y = pd.Series(data)
- prediction_length = 4
-
- train_input_fn = pandas_io.pandas_input_fn(
- x=x, y=y, batch_size=batch_size, num_epochs=None, shuffle=True)
- eval_input_fn = pandas_io.pandas_input_fn(
- x=x, y=y, batch_size=batch_size, shuffle=False)
- predict_input_fn = pandas_io.pandas_input_fn(
- x=x, batch_size=batch_size, shuffle=False)
-
- self._test_complete_flow(
- train_input_fn=train_input_fn,
- eval_input_fn=eval_input_fn,
- predict_input_fn=predict_input_fn,
- input_dimension=input_dimension,
- label_dimension=label_dimension,
- prediction_length=prediction_length)
-
- def test_input_fn_from_parse_example(self):
- """Tests complete flow with input_fn constructed from parse_example."""
- label_dimension = 2
- input_dimension = label_dimension
- batch_size = 10
- prediction_length = batch_size
- data = np.linspace(0., 2., batch_size * label_dimension, dtype=np.float32)
- data = data.reshape(batch_size, label_dimension)
-
- serialized_examples = []
- for datum in data:
- example = example_pb2.Example(features=feature_pb2.Features(
- feature={
- 'x':
- feature_pb2.Feature(float_list=feature_pb2.FloatList(
- value=datum)),
- 'y':
- feature_pb2.Feature(float_list=feature_pb2.FloatList(
- value=datum[:label_dimension])),
- }))
- serialized_examples.append(example.SerializeToString())
-
- feature_spec = {
- 'x': parsing_ops.FixedLenFeature([input_dimension], dtypes.float32),
- 'y': parsing_ops.FixedLenFeature([label_dimension], dtypes.float32),
- }
-
- def _train_input_fn():
- feature_map = parsing_ops.parse_example(serialized_examples, feature_spec)
- features = queue_parsed_features(feature_map)
- labels = features.pop('y')
- return features, labels
-
- def _eval_input_fn():
- feature_map = parsing_ops.parse_example(
- input_lib.limit_epochs(serialized_examples, num_epochs=1),
- feature_spec)
- features = queue_parsed_features(feature_map)
- labels = features.pop('y')
- return features, labels
-
- def _predict_input_fn():
- feature_map = parsing_ops.parse_example(
- input_lib.limit_epochs(serialized_examples, num_epochs=1),
- feature_spec)
- features = queue_parsed_features(feature_map)
- features.pop('y')
- return features, None
-
- self._test_complete_flow(
- train_input_fn=_train_input_fn,
- eval_input_fn=_eval_input_fn,
- predict_input_fn=_predict_input_fn,
- input_dimension=input_dimension,
- label_dimension=label_dimension,
- prediction_length=prediction_length)
-
-
-class BaselineRegressorTrainingTest(test.TestCase):
-
- def setUp(self):
- self._model_dir = tempfile.mkdtemp()
-
- def tearDown(self):
- if self._model_dir:
- writer_cache.FileWriterCache.clear()
- shutil.rmtree(self._model_dir)
-
- def _mock_optimizer(self, expected_loss=None):
- expected_var_names = [
- '%s:0' % BIAS_NAME
- ]
-
- def _minimize(loss, global_step=None, var_list=None):
- trainable_vars = var_list or ops.get_collection(
- ops.GraphKeys.TRAINABLE_VARIABLES)
- self.assertItemsEqual(expected_var_names,
- [var.name for var in trainable_vars])
-
- # Verify loss. We can't check the value directly, so we add an assert op.
- self.assertEqual(0, loss.shape.ndims)
- if expected_loss is None:
- if global_step is not None:
- return state_ops.assign_add(global_step, 1).op
- return control_flow_ops.no_op()
- assert_loss = assert_close(
- math_ops.to_float(expected_loss, name='expected'),
- loss,
- name='assert_loss')
- with ops.control_dependencies((assert_loss,)):
- if global_step is not None:
- return state_ops.assign_add(global_step, 1).op
- return control_flow_ops.no_op()
-
- mock_optimizer = test.mock.NonCallableMock(
- spec=optimizer.Optimizer,
- wraps=optimizer.Optimizer(use_locking=False, name='my_optimizer'))
- mock_optimizer.minimize = test.mock.MagicMock(wraps=_minimize)
-
- # NOTE: Estimator.params performs a deepcopy, which wreaks havoc with mocks.
- # So, return mock_optimizer itself for deepcopy.
- mock_optimizer.__deepcopy__ = lambda _: mock_optimizer
- return mock_optimizer
-
- def _assert_checkpoint(self,
- label_dimension,
- expected_global_step,
- expected_bias=None):
- shapes = {
- name: shape
- for (name, shape) in checkpoint_utils.list_variables(self._model_dir)
- }
-
- self.assertEqual([], shapes[ops.GraphKeys.GLOBAL_STEP])
- self.assertEqual(expected_global_step,
- checkpoint_utils.load_variable(self._model_dir,
- ops.GraphKeys.GLOBAL_STEP))
-
- self.assertEqual([label_dimension], shapes[BIAS_NAME])
- if expected_bias is not None:
- self.assertEqual(expected_bias,
- checkpoint_utils.load_variable(self._model_dir,
- BIAS_NAME))
-
- def testFromScratchWithDefaultOptimizer(self):
- # Create BaselineRegressor.
- label = 5.
- age = 17
- baseline_regressor = _baseline_regressor_fn(model_dir=self._model_dir)
-
- # Train for a few steps, and validate final checkpoint.
- num_steps = 10
- baseline_regressor.train(
- input_fn=lambda: ({'age': ((age,),)}, ((label,),)), steps=num_steps)
- self._assert_checkpoint(label_dimension=1, expected_global_step=num_steps)
-
- def testTrainWithOneDimLabel(self):
- label_dimension = 1
- batch_size = 20
- est = _baseline_regressor_fn(
- label_dimension=label_dimension,
- model_dir=self._model_dir)
- data_rank_1 = np.linspace(0., 2., batch_size, dtype=np.float32)
- self.assertEqual((batch_size,), data_rank_1.shape)
-
- train_input_fn = numpy_io.numpy_input_fn(
- x={'age': data_rank_1},
- y=data_rank_1,
- batch_size=batch_size,
- num_epochs=None,
- shuffle=True)
- est.train(train_input_fn, steps=200)
- self._assert_checkpoint(label_dimension=1, expected_global_step=200)
-
- def testTrainWithOneDimWeight(self):
- label_dimension = 1
- batch_size = 20
- est = _baseline_regressor_fn(
- label_dimension=label_dimension,
- weight_column='w',
- model_dir=self._model_dir)
-
- data_rank_1 = np.linspace(0., 2., batch_size, dtype=np.float32)
- self.assertEqual((batch_size,), data_rank_1.shape)
-
- train_input_fn = numpy_io.numpy_input_fn(
- x={'age': data_rank_1,
- 'w': data_rank_1},
- y=data_rank_1,
- batch_size=batch_size,
- num_epochs=None,
- shuffle=True)
- est.train(train_input_fn, steps=200)
- self._assert_checkpoint(label_dimension=1, expected_global_step=200)
-
- def testFromScratch(self):
- # Create BaselineRegressor.
- label = 5.
- age = 17
- # loss = (logits - label)^2 = (0 - 5.)^2 = 25.
- mock_optimizer = self._mock_optimizer(expected_loss=25.)
- baseline_regressor = _baseline_regressor_fn(
- model_dir=self._model_dir,
- optimizer=mock_optimizer)
- self.assertEqual(0, mock_optimizer.minimize.call_count)
-
- # Train for a few steps, and validate optimizer and final checkpoint.
- num_steps = 10
- baseline_regressor.train(
- input_fn=lambda: ({'age': ((age,),)}, ((label,),)), steps=num_steps)
- self.assertEqual(1, mock_optimizer.minimize.call_count)
- self._assert_checkpoint(
- label_dimension=1,
- expected_global_step=num_steps,
- expected_bias=[0.])
-
- def testFromCheckpoint(self):
- # Create initial checkpoint.
- bias = 7.0
- initial_global_step = 100
- with ops.Graph().as_default():
- variables.Variable([bias], name=BIAS_NAME)
- variables.Variable(
- initial_global_step,
- name=ops.GraphKeys.GLOBAL_STEP,
- dtype=dtypes.int64)
- save_variables_to_ckpt(self._model_dir)
-
- # logits = bias = 7.
- # loss = (logits - label)^2 = (7 - 5)^2 = 4
- mock_optimizer = self._mock_optimizer(expected_loss=4.)
- baseline_regressor = _baseline_regressor_fn(
- model_dir=self._model_dir,
- optimizer=mock_optimizer)
- self.assertEqual(0, mock_optimizer.minimize.call_count)
-
- # Train for a few steps, and validate optimizer and final checkpoint.
- num_steps = 10
- baseline_regressor.train(
- input_fn=lambda: ({'age': ((17,),)}, ((5.,),)), steps=num_steps)
- self.assertEqual(1, mock_optimizer.minimize.call_count)
- self._assert_checkpoint(
- label_dimension=1,
- expected_global_step=initial_global_step + num_steps,
- expected_bias=[bias])
-
- def testFromCheckpointMultiBatch(self):
- # Create initial checkpoint.
- bias = 5.0
- initial_global_step = 100
- with ops.Graph().as_default():
- variables.Variable([bias], name=BIAS_NAME)
- variables.Variable(
- initial_global_step,
- name=ops.GraphKeys.GLOBAL_STEP,
- dtype=dtypes.int64)
- save_variables_to_ckpt(self._model_dir)
-
- # logits = bias
- # logits[0] = 5.
- # logits[1] = 5.
- # loss = sum((logits - label)^2) = (5 - 5)^2 + (5 - 3)^2 = 4
- mock_optimizer = self._mock_optimizer(expected_loss=4.)
- baseline_regressor = _baseline_regressor_fn(
- model_dir=self._model_dir,
- optimizer=mock_optimizer)
- self.assertEqual(0, mock_optimizer.minimize.call_count)
-
- # Train for a few steps, and validate optimizer and final checkpoint.
- num_steps = 10
- baseline_regressor.train(
- input_fn=lambda: ({'age': ((17,), (15,))}, ((5.,), (3.,))),
- steps=num_steps)
- self.assertEqual(1, mock_optimizer.minimize.call_count)
- self._assert_checkpoint(
- label_dimension=1,
- expected_global_step=initial_global_step + num_steps,
- expected_bias=bias)
-
-
-# Tests for Baseline Classifier.
-
-
-class BaselineClassifierTrainingTest(test.TestCase):
-
- def setUp(self):
- self._model_dir = tempfile.mkdtemp()
-
- def tearDown(self):
- if self._model_dir:
- shutil.rmtree(self._model_dir)
-
- def _mock_optimizer(self, expected_loss=None):
- expected_var_names = [
- '%s:0' % BIAS_NAME
- ]
-
- def _minimize(loss, global_step):
- trainable_vars = ops.get_collection(ops.GraphKeys.TRAINABLE_VARIABLES)
- self.assertItemsEqual(
- expected_var_names,
- [var.name for var in trainable_vars])
-
- # Verify loss. We can't check the value directly, so we add an assert op.
- self.assertEqual(0, loss.shape.ndims)
- if expected_loss is None:
- return state_ops.assign_add(global_step, 1).op
- assert_loss = assert_close(
- math_ops.to_float(expected_loss, name='expected'),
- loss,
- name='assert_loss')
- with ops.control_dependencies((assert_loss,)):
- return state_ops.assign_add(global_step, 1).op
-
- mock_optimizer = test.mock.NonCallableMock(
- spec=optimizer.Optimizer,
- wraps=optimizer.Optimizer(use_locking=False, name='my_optimizer'))
- mock_optimizer.minimize = test.mock.MagicMock(wraps=_minimize)
-
- # NOTE: Estimator.params performs a deepcopy, which wreaks havoc with mocks.
- # So, return mock_optimizer itself for deepcopy.
- mock_optimizer.__deepcopy__ = lambda _: mock_optimizer
- return mock_optimizer
-
- def _assert_checkpoint(
- self, n_classes, expected_global_step, expected_bias=None):
- logits_dimension = n_classes if n_classes > 2 else 1
-
- shapes = {
- name: shape for (name, shape) in
- checkpoint_utils.list_variables(self._model_dir)
- }
-
- self.assertEqual([], shapes[ops.GraphKeys.GLOBAL_STEP])
- self.assertEqual(
- expected_global_step,
- checkpoint_utils.load_variable(
- self._model_dir, ops.GraphKeys.GLOBAL_STEP))
-
- self.assertEqual([logits_dimension], shapes[BIAS_NAME])
- if expected_bias is not None:
- self.assertAllEqual(expected_bias,
- checkpoint_utils.load_variable(
- self._model_dir, BIAS_NAME))
-
- def _testFromScratchWithDefaultOptimizer(self, n_classes):
- label = 0
- age = 17
- est = baseline.BaselineClassifier(
- n_classes=n_classes,
- model_dir=self._model_dir)
-
- # Train for a few steps, and validate final checkpoint.
- num_steps = 10
- est.train(
- input_fn=lambda: ({'age': ((age,),)}, ((label,),)), steps=num_steps)
- self._assert_checkpoint(n_classes, num_steps)
-
- def testBinaryClassesFromScratchWithDefaultOptimizer(self):
- self._testFromScratchWithDefaultOptimizer(n_classes=2)
-
- def testMultiClassesFromScratchWithDefaultOptimizer(self):
- self._testFromScratchWithDefaultOptimizer(n_classes=4)
-
- def _testTrainWithTwoDimsLabel(self, n_classes):
- batch_size = 20
-
- est = baseline.BaselineClassifier(
- n_classes=n_classes,
- model_dir=self._model_dir)
- data_rank_1 = np.array([0, 1])
- data_rank_2 = np.array([[0], [1]])
- self.assertEqual((2,), data_rank_1.shape)
- self.assertEqual((2, 1), data_rank_2.shape)
-
- train_input_fn = numpy_io.numpy_input_fn(
- x={'age': data_rank_1},
- y=data_rank_2,
- batch_size=batch_size,
- num_epochs=None,
- shuffle=True)
- est.train(train_input_fn, steps=200)
- self._assert_checkpoint(n_classes, 200)
-
- def testBinaryClassesTrainWithTwoDimsLabel(self):
- self._testTrainWithTwoDimsLabel(n_classes=2)
-
- def testMultiClassesTrainWithTwoDimsLabel(self):
- self._testTrainWithTwoDimsLabel(n_classes=4)
-
- def _testTrainWithOneDimLabel(self, n_classes):
- batch_size = 20
-
- est = baseline.BaselineClassifier(
- n_classes=n_classes,
- model_dir=self._model_dir)
- data_rank_1 = np.array([0, 1])
- self.assertEqual((2,), data_rank_1.shape)
-
- train_input_fn = numpy_io.numpy_input_fn(
- x={'age': data_rank_1},
- y=data_rank_1,
- batch_size=batch_size,
- num_epochs=None,
- shuffle=True)
- est.train(train_input_fn, steps=200)
- self._assert_checkpoint(n_classes, 200)
-
- def testBinaryClassesTrainWithOneDimLabel(self):
- self._testTrainWithOneDimLabel(n_classes=2)
-
- def testMultiClassesTrainWithOneDimLabel(self):
- self._testTrainWithOneDimLabel(n_classes=4)
-
- def _testTrainWithTwoDimsWeight(self, n_classes):
- batch_size = 20
-
- est = baseline.BaselineClassifier(
- weight_column='w',
- n_classes=n_classes,
- model_dir=self._model_dir)
- data_rank_1 = np.array([0, 1])
- data_rank_2 = np.array([[0], [1]])
- self.assertEqual((2,), data_rank_1.shape)
- self.assertEqual((2, 1), data_rank_2.shape)
-
- train_input_fn = numpy_io.numpy_input_fn(
- x={'age': data_rank_1, 'w': data_rank_2}, y=data_rank_1,
- batch_size=batch_size, num_epochs=None,
- shuffle=True)
- est.train(train_input_fn, steps=200)
- self._assert_checkpoint(n_classes, 200)
-
- def testBinaryClassesTrainWithTwoDimsWeight(self):
- self._testTrainWithTwoDimsWeight(n_classes=2)
-
- def testMultiClassesTrainWithTwoDimsWeight(self):
- self._testTrainWithTwoDimsWeight(n_classes=4)
-
- def _testTrainWithOneDimWeight(self, n_classes):
- batch_size = 20
-
- est = baseline.BaselineClassifier(
- weight_column='w',
- n_classes=n_classes,
- model_dir=self._model_dir)
- data_rank_1 = np.array([0, 1])
- self.assertEqual((2,), data_rank_1.shape)
-
- train_input_fn = numpy_io.numpy_input_fn(
- x={'age': data_rank_1, 'w': data_rank_1}, y=data_rank_1,
- batch_size=batch_size, num_epochs=None,
- shuffle=True)
- est.train(train_input_fn, steps=200)
- self._assert_checkpoint(n_classes, 200)
-
- def testBinaryClassesTrainWithOneDimWeight(self):
- self._testTrainWithOneDimWeight(n_classes=2)
-
- def testMultiClassesTrainWithOneDimWeight(self):
- self._testTrainWithOneDimWeight(n_classes=4)
-
- def _testFromScratch(self, n_classes):
- label = 1
- age = 17
- # For binary classifier:
- # loss = sigmoid_cross_entropy(logits, label) where logits=0 (the bias is
- # initially zero) and label = 1 so,
- # loss = 1 * -log ( sigmoid(logits) ) = 0.69315
- # For multi class classifier:
- # loss = cross_entropy(logits, label) where logits are all 0s (the bias is
- # initially zero) and label = 1 so,
- # loss = 1 * -log ( 1.0 / n_classes )
- # Since all logits are equal here, the formula
- # 1 * -log ( 1.0 / n_classes ) covers both the binary and multi class cases.
- mock_optimizer = self._mock_optimizer(
- expected_loss=-1 * math.log(1.0/n_classes))
-
- est = baseline.BaselineClassifier(
- n_classes=n_classes,
- optimizer=mock_optimizer,
- model_dir=self._model_dir)
- self.assertEqual(0, mock_optimizer.minimize.call_count)
-
- # Train for a few steps, and validate optimizer and final checkpoint.
- num_steps = 10
- est.train(
- input_fn=lambda: ({'age': ((age,),)}, ((label,),)), steps=num_steps)
- self.assertEqual(1, mock_optimizer.minimize.call_count)
- self._assert_checkpoint(
- n_classes,
- expected_global_step=num_steps,
- expected_bias=[0.] if n_classes == 2 else [.0] * n_classes)
-
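A quick numeric check of the expected-loss formula used in the comment above:

```python
# Worked check of expected_loss = -log(1 / n_classes).
import math

for n_classes in (2, 4):
    print(n_classes, -math.log(1.0 / n_classes))
# 2 -> 0.6931...  (the binary sigmoid_cross_entropy value)
# 4 -> 1.3862...
```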
- def testBinaryClassesFromScratch(self):
- self._testFromScratch(n_classes=2)
-
- def testMultiClassesFromScratch(self):
- self._testFromScratch(n_classes=4)
-
- def _testFromCheckpoint(self, n_classes):
- # Create initial checkpoint.
- label = 1
- age = 17
- bias = [-1.0] if n_classes == 2 else [-1.0] * n_classes
- initial_global_step = 100
- with ops.Graph().as_default():
- variables.Variable(bias, name=BIAS_NAME)
- variables.Variable(
- initial_global_step, name=ops.GraphKeys.GLOBAL_STEP,
- dtype=dtypes.int64)
- save_variables_to_ckpt(self._model_dir)
-
- # For binary classifier:
- # logits = bias = -1.
- # loss = sigmoid_cross_entropy(logits, label)
- # so, loss = 1 * -log ( sigmoid(-1) ) = 1.3133
- # For multi class classifier:
- # loss = cross_entropy(logits, label)
- # where logits = bias and label = 1
- # so, loss = 1 * -log ( softmax(logits)[1] )
- if n_classes == 2:
- expected_loss = 1.3133
- else:
- logits = bias
- logits_exp = np.exp(logits)
- softmax = logits_exp / logits_exp.sum()
- expected_loss = -1 * math.log(softmax[label])
-
- mock_optimizer = self._mock_optimizer(expected_loss=expected_loss)
-
- est = baseline.BaselineClassifier(
- n_classes=n_classes,
- optimizer=mock_optimizer,
- model_dir=self._model_dir)
- self.assertEqual(0, mock_optimizer.minimize.call_count)
-
- # Train for a few steps, and validate optimizer and final checkpoint.
- num_steps = 10
- est.train(
- input_fn=lambda: ({'age': ((age,),)}, ((label,),)), steps=num_steps)
- self.assertEqual(1, mock_optimizer.minimize.call_count)
- self._assert_checkpoint(
- n_classes,
- expected_global_step=initial_global_step + num_steps,
- expected_bias=bias)
-
- def testBinaryClassesFromCheckpoint(self):
- self._testFromCheckpoint(n_classes=2)
-
- def testMultiClassesFromCheckpoint(self):
- self._testFromCheckpoint(n_classes=4)
-
- def _testFromCheckpointFloatLabels(self, n_classes):
- """Tests float labels for binary classification."""
- # Create initial checkpoint.
- if n_classes > 2:
- return
- label = 0.8
- age = 17
- bias = [-1.0]
- initial_global_step = 100
- with ops.Graph().as_default():
- variables.Variable(bias, name=BIAS_NAME)
- variables.Variable(
- initial_global_step, name=ops.GraphKeys.GLOBAL_STEP,
- dtype=dtypes.int64)
- save_variables_to_ckpt(self._model_dir)
-
- # logits = bias = -1.
- # loss = sigmoid_cross_entropy(logits, label)
- # => loss = -0.8 * log(sigmoid(-1)) -0.2 * log(sigmoid(+1)) = 1.1132617
- mock_optimizer = self._mock_optimizer(expected_loss=1.1132617)
-
- est = baseline.BaselineClassifier(
- n_classes=n_classes,
- optimizer=mock_optimizer,
- model_dir=self._model_dir)
- self.assertEqual(0, mock_optimizer.minimize.call_count)
-
- # Train for a few steps, and validate optimizer and final checkpoint.
- num_steps = 10
- est.train(
- input_fn=lambda: ({'age': ((age,),)}, ((label,),)), steps=num_steps)
- self.assertEqual(1, mock_optimizer.minimize.call_count)
-
- def testBinaryClassesFromCheckpointFloatLabels(self):
- self._testFromCheckpointFloatLabels(n_classes=2)
-
- def testMultiClassesFromCheckpointFloatLabels(self):
- self._testFromCheckpointFloatLabels(n_classes=4)
-
- def _testFromCheckpointMultiBatch(self, n_classes):
- # Create initial checkpoint.
- label = [1, 0]
- age = [17, 18.5]
- # For the binary case, the bias has shape (1,); for the multi class
- # case, it has shape (n_classes,).
- bias = [-1.0] if n_classes == 2 else [-1.0] * n_classes
- initial_global_step = 100
- with ops.Graph().as_default():
- variables.Variable(bias, name=BIAS_NAME)
- variables.Variable(
- initial_global_step, name=ops.GraphKeys.GLOBAL_STEP,
- dtype=dtypes.int64)
- save_variables_to_ckpt(self._model_dir)
-
- # For binary classifier:
- # logits = bias
- # logits[0] = -1.
- # logits[1] = -1.
- # loss = sigmoid_cross_entropy(logits, label)
- # so, loss[0] = 1 * -log ( sigmoid(-1) ) = 1.3133
- # loss[1] = (1 - 0) * -log ( 1 - sigmoid(-1) ) = 0.3132
- # For multi class classifier:
- # loss = cross_entropy(logits, label)
- # where logits = bias and label = [1, 0]
- # so, loss = 1 * -log ( softmax(logits)[label] )
- if n_classes == 2:
- expected_loss = (1.3133 + 0.3132)
- else:
- # Expand logits since batch_size=2
- logits = bias * np.ones(shape=(2, 1))
- logits_exp = np.exp(logits)
- softmax_row_0 = logits_exp[0] / logits_exp[0].sum()
- softmax_row_1 = logits_exp[1] / logits_exp[1].sum()
- expected_loss_0 = -1 * math.log(softmax_row_0[label[0]])
- expected_loss_1 = -1 * math.log(softmax_row_1[label[1]])
- expected_loss = expected_loss_0 + expected_loss_1
-
- mock_optimizer = self._mock_optimizer(expected_loss=expected_loss)
-
- est = baseline.BaselineClassifier(
- n_classes=n_classes,
- optimizer=mock_optimizer,
- model_dir=self._model_dir)
- self.assertEqual(0, mock_optimizer.minimize.call_count)
-
- # Train for a few steps, and validate optimizer and final checkpoint.
- num_steps = 10
- est.train(
- input_fn=lambda: ({'age': (age)}, (label)),
- steps=num_steps)
- self.assertEqual(1, mock_optimizer.minimize.call_count)
- self._assert_checkpoint(
- n_classes,
- expected_global_step=initial_global_step + num_steps,
- expected_bias=bias)
-
- def testBinaryClassesFromCheckpointMultiBatch(self):
- self._testFromCheckpointMultiBatch(n_classes=2)
-
- def testMultiClassesFromCheckpointMultiBatch(self):
- self._testFromCheckpointMultiBatch(n_classes=4)
-
-
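For the batch case, the expected loss is the SUM reduction over per-example sigmoid cross-entropies; a standalone sketch of the two binary terms used above:

    import math

    def sigmoid(x):
      return 1.0 / (1.0 + math.exp(-x))

    labels, logit = [1, 0], -1.0
    per_example = [-y * math.log(sigmoid(logit))
                   - (1 - y) * math.log(1 - sigmoid(logit)) for y in labels]
    print(per_example)       # [~1.3133, ~0.3132]
    print(sum(per_example))  # ~1.6265: the SUM-reduced loss the test expects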
-class BaselineClassifierEvaluationTest(test.TestCase):
-
- def setUp(self):
- self._model_dir = tempfile.mkdtemp()
-
- def tearDown(self):
- if self._model_dir:
- shutil.rmtree(self._model_dir)
-
- def _test_evaluation_for_simple_data(self, n_classes):
- label = 1
- age = 1.
-
- bias = [-1.0] if n_classes == 2 else [-1.0] * n_classes
-
- with ops.Graph().as_default():
- variables.Variable(bias, name=BIAS_NAME)
- variables.Variable(
- 100, name=ops.GraphKeys.GLOBAL_STEP, dtype=dtypes.int64)
- save_variables_to_ckpt(self._model_dir)
-
- est = _baseline_classifier_fn(
- n_classes=n_classes,
- model_dir=self._model_dir)
- eval_metrics = est.evaluate(
- input_fn=lambda: ({'age': ((age,),)}, ((label,),)), steps=1)
-
- if n_classes == 2:
- # Binary classes: loss = -log(sigmoid(-1)) = 1.3133
- # Prediction = sigmoid(-1) = 0.2689
- expected_metrics = {
- metric_keys.MetricKeys.LOSS: 1.3133,
- ops.GraphKeys.GLOBAL_STEP: 100,
- metric_keys.MetricKeys.LOSS_MEAN: 1.3133,
- metric_keys.MetricKeys.ACCURACY: 0.,
- metric_keys.MetricKeys.PREDICTION_MEAN: 0.2689,
- metric_keys.MetricKeys.LABEL_MEAN: 1.,
- metric_keys.MetricKeys.ACCURACY_BASELINE: 1,
- metric_keys.MetricKeys.AUC: 0.,
- metric_keys.MetricKeys.AUC_PR: 1.,
- }
- else:
- # Multi classes: loss = 1 * -log ( softmax(logits)[label] )
- logits = bias
- logits_exp = np.exp(logits)
- softmax = logits_exp / logits_exp.sum()
- expected_loss = -1 * math.log(softmax[label])
-
- expected_metrics = {
- metric_keys.MetricKeys.LOSS: expected_loss,
- ops.GraphKeys.GLOBAL_STEP: 100,
- metric_keys.MetricKeys.LOSS_MEAN: expected_loss,
- metric_keys.MetricKeys.ACCURACY: 0.,
- }
-
- self.assertAllClose(sorted_key_dict(expected_metrics),
- sorted_key_dict(eval_metrics), rtol=1e-3)
-
- def test_binary_classes_evaluation_for_simple_data(self):
- self._test_evaluation_for_simple_data(n_classes=2)
-
- def test_multi_classes_evaluation_for_simple_data(self):
- self._test_evaluation_for_simple_data(n_classes=4)
-
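The binary metrics above all derive from sigmoid(-1); a quick check of the two constants:

    import math

    p = 1.0 / (1.0 + math.exp(1.0))  # sigmoid(-1)
    print(p)                         # ~0.2689: PREDICTION_MEAN
    print(-math.log(p))              # ~1.3133: LOSS and LOSS_MEAN for label=1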
- def _test_evaluation_batch(self, n_classes):
- """Tests evaluation for batch_size==2."""
- label = [1, 0]
- age = [17., 18.]
- bias = [-1.0] if n_classes == 2 else [-1.0] * n_classes
- initial_global_step = 100
- with ops.Graph().as_default():
- variables.Variable(bias, name=BIAS_NAME)
- variables.Variable(
- initial_global_step, name=ops.GraphKeys.GLOBAL_STEP,
- dtype=dtypes.int64)
- save_variables_to_ckpt(self._model_dir)
-
- est = _baseline_classifier_fn(
- n_classes=n_classes,
- model_dir=self._model_dir)
- eval_metrics = est.evaluate(
- input_fn=lambda: ({'age': (age)}, (label)), steps=1)
-
- if n_classes == 2:
- # Logits are (-1., -1.) labels are (1, 0).
- # Loss is
- # loss for row 1: 1 * -log(sigmoid(-1)) = 1.3133
- # loss for row 2: (1 - 0) * -log(1 - sigmoid(-1)) = 0.3132
- # Prediction = sigmoid(-1) = 0.2689
- expected_loss = 1.3133 + 0.3132
-
- expected_metrics = {
- metric_keys.MetricKeys.LOSS: expected_loss,
- ops.GraphKeys.GLOBAL_STEP: 100,
- metric_keys.MetricKeys.LOSS_MEAN: expected_loss / 2,
- metric_keys.MetricKeys.ACCURACY: 0.5,
- metric_keys.MetricKeys.PREDICTION_MEAN: 0.2689,
- metric_keys.MetricKeys.LABEL_MEAN: 0.5,
- metric_keys.MetricKeys.ACCURACY_BASELINE: 0.5,
- metric_keys.MetricKeys.AUC: 0.5,
- metric_keys.MetricKeys.AUC_PR: 0.75,
- }
- else:
- # Expand logits since batch_size=2
- logits = bias * np.ones(shape=(2, 1))
- logits_exp = np.exp(logits)
- softmax_row_0 = logits_exp[0] / logits_exp[0].sum()
- softmax_row_1 = logits_exp[1] / logits_exp[1].sum()
- expected_loss_0 = -1 * math.log(softmax_row_0[label[0]])
- expected_loss_1 = -1 * math.log(softmax_row_1[label[1]])
- expected_loss = expected_loss_0 + expected_loss_1
-
- expected_metrics = {
- metric_keys.MetricKeys.LOSS: expected_loss,
- ops.GraphKeys.GLOBAL_STEP: 100,
- metric_keys.MetricKeys.LOSS_MEAN: expected_loss / 2,
- metric_keys.MetricKeys.ACCURACY: 0.5,
- }
-
- self.assertAllClose(sorted_key_dict(expected_metrics),
- sorted_key_dict(eval_metrics), rtol=1e-3)
-
- def test_binary_classes_evaluation_batch(self):
- self._test_evaluation_batch(n_classes=2)
-
- def test_multi_classes_evaluation_batch(self):
- self._test_evaluation_batch(n_classes=4)
-
- def _test_evaluation_weights(self, n_classes):
- """Tests evaluation with weights."""
-
- label = [1, 0]
- age = [17., 18.]
- weights = [1., 2.]
-    # For the binary case, the bias has shape (1,); for the multi-class case,
-    # it has shape (n_classes,). Every entry starts at -1.0.
- bias = [-1.0] if n_classes == 2 else [-1.0] * n_classes
- initial_global_step = 100
- with ops.Graph().as_default():
- variables.Variable(bias, name=BIAS_NAME)
- variables.Variable(
- initial_global_step, name=ops.GraphKeys.GLOBAL_STEP,
- dtype=dtypes.int64)
- save_variables_to_ckpt(self._model_dir)
-
- est = _baseline_classifier_fn(
- n_classes=n_classes,
- weight_column='w',
- model_dir=self._model_dir)
- eval_metrics = est.evaluate(
- input_fn=lambda: ({'age': (age), 'w': (weights)}, (label)), steps=1)
-
- if n_classes == 2:
- # Logits are (-1., -1.) labels are (1, 0).
- # Loss is
- # loss for row 1: 1 * -log(sigmoid(-1)) = 1.3133
- # loss for row 2: (1 - 0) * -log(1 - sigmoid(-1)) = 0.3132
- # weights = [1., 2.]
- expected_loss = 1.3133 * 1. + 0.3132 * 2.
- loss_mean = expected_loss / (1.0 + 2.0)
- label_mean = np.average(label, weights=weights)
- logits = [-1, -1]
- logistics = sigmoid(np.array(logits))
- predictions_mean = np.average(logistics, weights=weights)
-
- expected_metrics = {
- metric_keys.MetricKeys.LOSS: expected_loss,
- ops.GraphKeys.GLOBAL_STEP: 100,
- metric_keys.MetricKeys.LOSS_MEAN: loss_mean,
- metric_keys.MetricKeys.ACCURACY: 2. / (1. + 2.),
- metric_keys.MetricKeys.PREDICTION_MEAN: predictions_mean,
- metric_keys.MetricKeys.LABEL_MEAN: label_mean,
- metric_keys.MetricKeys.ACCURACY_BASELINE: (
- max(label_mean, 1-label_mean)),
- metric_keys.MetricKeys.AUC: 0.5,
- metric_keys.MetricKeys.AUC_PR: 2. / (1. + 2.),
- }
- else:
- # Multi classes: unweighted_loss = 1 * -log ( soft_max(logits)[label] )
- # Expand logits since batch_size=2
- logits = bias * np.ones(shape=(2, 1))
- logits_exp = np.exp(logits)
- softmax_row_0 = logits_exp[0] / logits_exp[0].sum()
- softmax_row_1 = logits_exp[1] / logits_exp[1].sum()
- expected_loss_0 = -1 * math.log(softmax_row_0[label[0]])
- expected_loss_1 = -1 * math.log(softmax_row_1[label[1]])
- loss_mean = np.average([expected_loss_0, expected_loss_1],
- weights=weights)
- expected_loss = loss_mean * np.sum(weights)
-
- expected_metrics = {
- metric_keys.MetricKeys.LOSS: expected_loss,
- ops.GraphKeys.GLOBAL_STEP: 100,
- metric_keys.MetricKeys.LOSS_MEAN: loss_mean,
- metric_keys.MetricKeys.ACCURACY: 2. / (1. + 2.),
- }
-
- self.assertAllClose(sorted_key_dict(expected_metrics),
- sorted_key_dict(eval_metrics), rtol=1e-3)
-
- def test_binary_classes_evaluation_weights(self):
- self._test_evaluation_weights(n_classes=2)
-
- def test_multi_classes_evaluation_weights(self):
- self._test_evaluation_weights(n_classes=4)
-
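The weighted metrics reduce to a weighted sum and a weighted average of the per-example losses; a standalone NumPy check matching the constants above:

    import numpy as np

    losses = np.array([1.3133, 0.3132])  # per-example losses from above
    weights = np.array([1., 2.])
    weighted_sum = (losses * weights).sum()     # LOSS under SUM reduction
    print(weighted_sum)                         # ~1.9397
    print(weighted_sum / weights.sum())         # LOSS_MEAN
    print(np.average(losses, weights=weights))  # same, as the test computes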
-
-class BaselineClassifierPredictTest(test.TestCase):
-
- def setUp(self):
- self._model_dir = tempfile.mkdtemp()
-
- def tearDown(self):
- if self._model_dir:
- shutil.rmtree(self._model_dir)
-
- def _testPredictions(self, n_classes, label_vocabulary, label_output_fn):
- """Tests predict when all variables are one-dimensional."""
- age = 1.
-
- bias = [10.0] if n_classes == 2 else [10.0] * n_classes
-
- with ops.Graph().as_default():
- variables.Variable(bias, name=BIAS_NAME)
- variables.Variable(100, name='global_step', dtype=dtypes.int64)
- save_variables_to_ckpt(self._model_dir)
-
- est = _baseline_classifier_fn(
- label_vocabulary=label_vocabulary,
- n_classes=n_classes,
- model_dir=self._model_dir)
-
- predict_input_fn = numpy_io.numpy_input_fn(
- x={'age': np.array([[age]])},
- y=None,
- batch_size=1,
- num_epochs=1,
- shuffle=False)
- predictions = list(est.predict(input_fn=predict_input_fn))
-
- if n_classes == 2:
- scalar_logits = bias[0]
- two_classes_logits = [0, scalar_logits]
- two_classes_logits_exp = np.exp(two_classes_logits)
- softmax = two_classes_logits_exp / two_classes_logits_exp.sum()
-
- expected_predictions = {
- 'class_ids': [1],
- 'classes': [label_output_fn(1)],
- 'logistic': [sigmoid(np.array(scalar_logits))],
- 'logits': [scalar_logits],
- 'probabilities': softmax,
- }
- else:
- onedim_logits = np.array(bias)
- class_ids = onedim_logits.argmax()
- logits_exp = np.exp(onedim_logits)
- softmax = logits_exp / logits_exp.sum()
- expected_predictions = {
- 'class_ids': [class_ids],
- 'classes': [label_output_fn(class_ids)],
- 'logits': onedim_logits,
- 'probabilities': softmax,
- }
-
- self.assertEqual(1, len(predictions))
- # assertAllClose cannot handle byte type.
- self.assertEqual(expected_predictions['classes'], predictions[0]['classes'])
- expected_predictions.pop('classes')
- predictions[0].pop('classes')
- self.assertAllClose(sorted_key_dict(expected_predictions),
- sorted_key_dict(predictions[0]))
-
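The binary branch above pads the scalar logit with a zero for class 0 before taking the softmax; a standalone sketch:

    import numpy as np

    scalar_logit = 10.0
    two_class_logits = np.array([0.0, scalar_logit])  # class-0 logit fixed at 0
    exp = np.exp(two_class_logits)
    print(exp / exp.sum())  # ~[4.5e-05, 0.99995]: the expected 'probabilities'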
- def testBinaryClassesWithoutLabelVocabulary(self):
- n_classes = 2
- self._testPredictions(n_classes,
- label_vocabulary=None,
- label_output_fn=lambda x: ('%s' % x).encode())
-
- def testBinaryClassesWithLabelVocabulary(self):
- n_classes = 2
- self._testPredictions(
- n_classes,
- label_vocabulary=['class_vocab_{}'.format(i)
- for i in range(n_classes)],
- label_output_fn=lambda x: ('class_vocab_%s' % x).encode())
-
- def testMultiClassesWithoutLabelVocabulary(self):
- n_classes = 4
- self._testPredictions(
- n_classes,
- label_vocabulary=None,
- label_output_fn=lambda x: ('%s' % x).encode())
-
- def testMultiClassesWithLabelVocabulary(self):
- n_classes = 4
- self._testPredictions(
- n_classes,
- label_vocabulary=['class_vocab_{}'.format(i)
- for i in range(n_classes)],
- label_output_fn=lambda x: ('class_vocab_%s' % x).encode())
-
-
-class BaselineClassifierIntegrationTest(test.TestCase):
-
- def setUp(self):
- self._model_dir = tempfile.mkdtemp()
-
- def tearDown(self):
- if self._model_dir:
- shutil.rmtree(self._model_dir)
-
- def _test_complete_flow(self, n_classes, train_input_fn, eval_input_fn,
- predict_input_fn, input_dimension, prediction_length):
- feature_columns = [
- feature_column_lib.numeric_column('x', shape=(input_dimension,))
- ]
- est = _baseline_classifier_fn(
- n_classes=n_classes,
- model_dir=self._model_dir)
-
- # TRAIN
- # learn y = x
- est.train(train_input_fn, steps=200)
-
-    # EVALUATE
- scores = est.evaluate(eval_input_fn)
- self.assertEqual(200, scores[ops.GraphKeys.GLOBAL_STEP])
- self.assertIn(metric_keys.MetricKeys.LOSS, six.iterkeys(scores))
-
- # PREDICT
- predictions = np.array(
- [x['classes'] for x in est.predict(predict_input_fn)])
- self.assertAllEqual((prediction_length, 1), predictions.shape)
-
- # EXPORT
- feature_spec = feature_column_lib.make_parse_example_spec(feature_columns)
- serving_input_receiver_fn = export.build_parsing_serving_input_receiver_fn(
- feature_spec)
- export_dir = est.export_savedmodel(tempfile.mkdtemp(),
- serving_input_receiver_fn)
- self.assertTrue(gfile.Exists(export_dir))
-
- def _test_numpy_input_fn(self, n_classes):
- """Tests complete flow with numpy_input_fn."""
- input_dimension = 4
- batch_size = 10
- prediction_length = batch_size
- data = np.linspace(0., 2., batch_size * input_dimension, dtype=np.float32)
- data = data.reshape(batch_size, input_dimension)
- target = np.array([1] * batch_size)
-
- train_input_fn = numpy_io.numpy_input_fn(
- x={'x': data},
- y=target,
- batch_size=batch_size,
- num_epochs=None,
- shuffle=True)
- eval_input_fn = numpy_io.numpy_input_fn(
- x={'x': data},
- y=target,
- batch_size=batch_size,
- num_epochs=1,
- shuffle=False)
- predict_input_fn = numpy_io.numpy_input_fn(
- x={'x': data},
- y=None,
- batch_size=batch_size,
- num_epochs=1,
- shuffle=False)
-
- self._test_complete_flow(
- n_classes=n_classes,
- train_input_fn=train_input_fn,
- eval_input_fn=eval_input_fn,
- predict_input_fn=predict_input_fn,
- input_dimension=input_dimension,
- prediction_length=prediction_length)
-
- def test_binary_classes_numpy_input_fn(self):
- self._test_numpy_input_fn(n_classes=2)
-
- def test_multi_classes_numpy_input_fn(self):
- self._test_numpy_input_fn(n_classes=4)
-
- def _test_pandas_input_fn(self, n_classes):
- """Tests complete flow with pandas_input_fn."""
- if not HAS_PANDAS:
- return
-
-    # Pandas DataFrame naturally supports 1-dim data only.
- input_dimension = 1
- batch_size = 10
- data = np.array([1., 2., 3., 4.], dtype=np.float32)
- target = np.array([1, 0, 1, 0], dtype=np.int32)
- x = pd.DataFrame({'x': data})
- y = pd.Series(target)
- prediction_length = 4
-
- train_input_fn = pandas_io.pandas_input_fn(
- x=x, y=y, batch_size=batch_size, num_epochs=None, shuffle=True)
- eval_input_fn = pandas_io.pandas_input_fn(
- x=x, y=y, batch_size=batch_size, shuffle=False)
- predict_input_fn = pandas_io.pandas_input_fn(
- x=x, batch_size=batch_size, shuffle=False)
-
- self._test_complete_flow(
- n_classes=n_classes,
- train_input_fn=train_input_fn,
- eval_input_fn=eval_input_fn,
- predict_input_fn=predict_input_fn,
- input_dimension=input_dimension,
- prediction_length=prediction_length)
-
- def test_binary_classes_pandas_input_fn(self):
- self._test_pandas_input_fn(n_classes=2)
-
- def test_multi_classes_pandas_input_fn(self):
- self._test_pandas_input_fn(n_classes=4)
-
- def _test_input_fn_from_parse_example(self, n_classes):
- """Tests complete flow with input_fn constructed from parse_example."""
- input_dimension = 2
- batch_size = 10
- prediction_length = batch_size
- data = np.linspace(0., 2., batch_size * input_dimension, dtype=np.float32)
- data = data.reshape(batch_size, input_dimension)
- target = np.array([1] * batch_size, dtype=np.int64)
-
- serialized_examples = []
- for x, y in zip(data, target):
- example = example_pb2.Example(features=feature_pb2.Features(
- feature={
- 'x':
- feature_pb2.Feature(float_list=feature_pb2.FloatList(
- value=x)),
- 'y':
- feature_pb2.Feature(int64_list=feature_pb2.Int64List(
- value=[y])),
- }))
- serialized_examples.append(example.SerializeToString())
-
- feature_spec = {
- 'x': parsing_ops.FixedLenFeature([input_dimension], dtypes.float32),
- 'y': parsing_ops.FixedLenFeature([1], dtypes.int64),
- }
-
- def _train_input_fn():
- feature_map = parsing_ops.parse_example(serialized_examples, feature_spec)
- features = queue_parsed_features(feature_map)
- labels = features.pop('y')
- return features, labels
-
- def _eval_input_fn():
- feature_map = parsing_ops.parse_example(
- input_lib.limit_epochs(serialized_examples, num_epochs=1),
- feature_spec)
- features = queue_parsed_features(feature_map)
- labels = features.pop('y')
- return features, labels
-
- def _predict_input_fn():
- feature_map = parsing_ops.parse_example(
- input_lib.limit_epochs(serialized_examples, num_epochs=1),
- feature_spec)
- features = queue_parsed_features(feature_map)
- features.pop('y')
- return features, None
-
- self._test_complete_flow(
- n_classes=n_classes,
- train_input_fn=_train_input_fn,
- eval_input_fn=_eval_input_fn,
- predict_input_fn=_predict_input_fn,
- input_dimension=input_dimension,
- prediction_length=prediction_length)
-
- def test_binary_classes_input_fn_from_parse_example(self):
- self._test_input_fn_from_parse_example(n_classes=2)
-
- def test_multi_classes_input_fn_from_parse_example(self):
- self._test_input_fn_from_parse_example(n_classes=4)
-
-
-# Tests for Baseline logit_fn.
-
-
-class BaselineLogitFnTest(test.TestCase):
-
- def test_basic_logit_correctness(self):
- """baseline_logit_fn simply returns the bias variable."""
- with ops.Graph().as_default():
- logit_fn = baseline._baseline_logit_fn_builder(num_outputs=2)
- logits = logit_fn(features={'age': [[23.], [31.]]})
- with variable_scope.variable_scope('baseline', reuse=True):
- bias_var = variable_scope.get_variable('bias')
- with tf_session.Session() as sess:
- sess.run([variables.global_variables_initializer()])
- self.assertAllClose([[0., 0.], [0., 0.]], logits.eval())
- sess.run(bias_var.assign([10., 5.]))
- self.assertAllClose([[10., 5.], [10., 5.]], logits.eval())
-
-
-if __name__ == '__main__':
- test.main()
-
diff --git a/tensorflow/python/estimator/canned/head.py b/tensorflow/python/estimator/canned/head.py
index eaed412c8b..01c00621ce 100644
--- a/tensorflow/python/estimator/canned/head.py
+++ b/tensorflow/python/estimator/canned/head.py
@@ -117,7 +117,7 @@ class _Head(object):
update_op = tf.contrib.layers.optimize_loss(optimizer=sync,
loss=estimator_spec.loss, ...)
hooks = [sync.make_session_run_hook(is_chief)]
- ... upate train_op and hooks in EstimatorSpec and return
+ ... update train_op and hooks in EstimatorSpec and return
```
"""
__metaclass__ = abc.ABCMeta
@@ -264,55 +264,26 @@ def _check_dense_labels_match_logits_and_reshape(
return array_ops.identity(labels, name=scope)
-def _get_weights_and_check_match_logits(
- features, weight_column, logits, allow_per_logit_weights=False):
- """Fetches weights from features and checks that the shape matches logits.
+def _check_weights_match_logits_and_reshape(weights, logits):
+ """Checks that weights shape matches logits and reshapes if needed.
Consider logits of shape [D0, D1, ... DN, logits_dimension]. Weights shape
can be either:
- * [D0, D1, ... DN, logits_dimension] if `allow_per_logit_weights=True`.
+ * [D0, D1, ... DN, logits_dimension]
* [D0, D1, ... DN, 1]
* [D0, D1, ... DN]: In this case, weights is reshaped into
[D0, D1, ... DN, 1] to work with weight broadcasting rules.
Args:
- features: The features dict that contains weights.
- weight_column: The weight column. If not given, this method returns 1.
+ weights: weights Tensor.
logits: logits Tensor.
- allow_per_logit_weights: Boolean. Whether we allow weights along the logits
- dimension, namely shape `[D0, D1, ... DN, logits_dimension]`.
Returns:
Validated and reshaped weights Tensor.
- Raises:
- ValueError: If the weights `Tensor` cannot be cast into float.
"""
- if allow_per_logit_weights:
- err_msg = (
- 'weights shape must be [D0, D1, ... DN], [D0, D1, ... DN, 1] or '
- '[D0, D1, ... DN, logits_dimension]')
- else:
- err_msg = (
- 'weights shape must be [D0, D1, ... DN] or [D0, D1, ... DN, 1]')
- with ops.name_scope(
- None, 'weights',
- values=tuple(six.itervalues(features)) + (logits,)) as scope:
- # Fetch the weights.
- if weight_column is None:
- return 1.
- if isinstance(weight_column, six.string_types):
- weight_column = feature_column_lib.numeric_column(
- key=weight_column, shape=(1,))
- if not isinstance(weight_column, feature_column_lib._NumericColumn): # pylint: disable=protected-access
- raise TypeError('Weight column must be either a string or _NumericColumn.'
- ' Given type: {}.'.format(type(weight_column)))
- weights = weight_column._get_dense_tensor( # pylint: disable=protected-access
- feature_column_lib._LazyBuilder(features)) # pylint: disable=protected-access
- if not (weights.dtype.is_floating or weights.dtype.is_integer):
- raise ValueError('Weight column should be castable to float. '
- 'Given dtype: {}'.format(weights.dtype))
- weights = math_ops.to_float(weights, name='weights')
-
- # Validate the weights shape.
+ err_msg = (
+ 'weights shape must be [D0, D1, ... DN], [D0, D1, ... DN, 1] or '
+ '[D0, D1, ... DN, logits_dimension]')
+ with ops.name_scope(None, 'weights', (weights, logits)) as scope:
weights_shape = array_ops.shape(weights, name='weights_shape')
logits_shape = array_ops.shape(logits, name='logits_shape')
if (weights.shape.ndims is not None and logits.shape.ndims is not None and
@@ -324,24 +295,42 @@ def _get_weights_and_check_match_logits(
with ops.control_dependencies([assert_dimension]):
return array_ops.expand_dims(weights, -1, name=scope)
supported_weights_shape = array_ops.concat([logits_shape[:-1], [1]], axis=0)
- if allow_per_logit_weights:
- condition = math_ops.reduce_any(
- [math_ops.reduce_all(math_ops.equal(logits_shape, weights_shape)),
- math_ops.reduce_all(math_ops.equal(
- supported_weights_shape, weights_shape))])
- assert_dimension = control_flow_ops.Assert(
- condition=condition,
- data=[err_msg, 'logits_shape: ', logits_shape,
- 'weights_shape: ', weights_shape])
- else:
- assert_dimension = check_ops.assert_equal(
- supported_weights_shape, weights_shape, message=err_msg,
- data=['logits_shape: ', logits_shape,
- 'weights_shape: ', weights_shape])
+ condition = math_ops.reduce_any(
+ [math_ops.reduce_all(math_ops.equal(logits_shape, weights_shape)),
+ math_ops.reduce_all(math_ops.equal(
+ supported_weights_shape, weights_shape))])
+ assert_dimension = control_flow_ops.Assert(
+ condition=condition,
+ data=[err_msg, 'logits_shape: ', logits_shape,
+ 'weights_shape: ', weights_shape])
with ops.control_dependencies([assert_dimension]):
return array_ops.identity(weights, name=scope)
+# TODO(roumposg): Delete once all heads support multi-dim input.
+def _check_logits(logits, expected_logits_dimension):
+ """Check logits type and shape."""
+ with ops.name_scope(None, 'logits', (logits,)) as scope:
+ logits = math_ops.to_float(logits)
+ logits_shape = array_ops.shape(logits)
+ assert_rank = check_ops.assert_rank(
+ logits, 2, data=[logits_shape],
+ message='logits shape must be [batch_size, logits_dimension]')
+ with ops.control_dependencies([assert_rank]):
+ static_shape = logits.shape
+ if static_shape is not None:
+ dim1 = static_shape[1]
+ if (dim1 is not None) and (dim1 != expected_logits_dimension):
+ raise ValueError(
+ 'logits shape must be [batch_size, logits_dimension], got %s.' %
+ (static_shape,))
+ assert_dimension = check_ops.assert_equal(
+ expected_logits_dimension, logits_shape[1], data=[logits_shape],
+ message='logits shape must be [batch_size, logits_dimension]')
+ with ops.control_dependencies([assert_dimension]):
+ return array_ops.identity(logits, name=scope)
+
+
def _check_logits_final_dim(logits, expected_logits_dimension):
"""Checks that logits shape is [D0, D1, ... DN, logits_dimension]."""
with ops.name_scope(None, 'logits', (logits,)) as scope:
@@ -586,8 +575,10 @@ class _MultiClassHeadWithSoftmaxCrossEntropyLoss(_Head):
labels=label_ids, logits=logits, reduction=losses.Reduction.NONE)
# Restore the squeezed dim, so unweighted_loss matches the weights shape.
unweighted_loss = array_ops.expand_dims(unweighted_loss, axis=-1)
- weights = _get_weights_and_check_match_logits(
- features=features, weight_column=self._weight_column, logits=logits)
+ weights = _weights(features, self._weight_column)
+ if self._weight_column is not None:
+ weights = _check_weights_match_logits_and_reshape(
+ weights=weights, logits=logits)
weighted_sum_loss = losses.compute_weighted_loss(
unweighted_loss, weights=weights, reduction=losses.Reduction.SUM)
# _weights() can return 1.
@@ -689,7 +680,7 @@ class _MultiClassHeadWithSoftmaxCrossEntropyLoss(_Head):
def _binary_logistic_head_with_sigmoid_cross_entropy_loss(
weight_column=None, thresholds=None, label_vocabulary=None, name=None):
- """Creates a `_Head` for single label binary classification.
+ """Creates a `Head` for single label binary classification.
This head uses `sigmoid_cross_entropy_with_logits` loss.
@@ -727,7 +718,7 @@ def _binary_logistic_head_with_sigmoid_cross_entropy_loss(
suffixed by `"/" + name`. Also used as `name_scope` when creating ops.
Returns:
- An instance of `_Head` for binary classification.
+ An instance of `Head` for binary classification.
Raises:
ValueError: if `thresholds` contains a value outside of `(0, 1)`.
@@ -861,8 +852,10 @@ class _BinaryLogisticHeadWithSigmoidCrossEntropyLoss(_Head):
labels = _assert_range(labels, 2)
unweighted_loss = nn.sigmoid_cross_entropy_with_logits(
labels=labels, logits=logits)
- weights = _get_weights_and_check_match_logits(
- features=features, weight_column=self._weight_column, logits=logits)
+ weights = _weights(features, self._weight_column)
+ if self._weight_column is not None:
+ weights = _check_weights_match_logits_and_reshape(
+ weights=weights, logits=logits)
weighted_sum_loss = losses.compute_weighted_loss(
unweighted_loss, weights=weights, reduction=losses.Reduction.SUM)
# _weights() can return 1.
@@ -925,8 +918,12 @@ class _BinaryLogisticHeadWithSigmoidCrossEntropyLoss(_Head):
# Eval.
if mode == model_fn.ModeKeys.EVAL:
- weights = _get_weights_and_check_match_logits(
- features=features, weight_column=self._weight_column, logits=logits)
+ weights = _weights(features, self._weight_column)
+ # TODO(roumposg): Merge this logic inside _weights once all heads
+ # support multi-dimensional inputs.
+ if self._weight_column is not None:
+ weights = _check_weights_match_logits_and_reshape(
+ weights=weights, logits=logits)
return model_fn.EstimatorSpec(
mode=model_fn.ModeKeys.EVAL,
predictions=predictions,
@@ -960,7 +957,7 @@ class _BinaryLogisticHeadWithSigmoidCrossEntropyLoss(_Head):
def _regression_head_with_mean_squared_error_loss(weight_column=None,
label_dimension=1,
name=None):
- """Creates a `_Head` for regression using the `mean_squared_error` loss.
+ """Creates a `_Head` for regression using the mean squared loss.
The loss is the weighted sum over all input dimensions. Namely, if the input
labels have shape `[batch_size, label_dimension]`, the loss is the weighted
@@ -1026,9 +1023,10 @@ class _RegressionHeadWithMeanSquaredErrorLoss(_Head):
labels = math_ops.to_float(labels)
unweighted_loss = losses.mean_squared_error(
labels=labels, predictions=logits, reduction=losses.Reduction.NONE)
- weights = _get_weights_and_check_match_logits(
- features=features, weight_column=self._weight_column, logits=logits,
- allow_per_logit_weights=True)
+ weights = _weights(features, self._weight_column)
+ if self._weight_column is not None:
+ weights = _check_weights_match_logits_and_reshape(
+ weights=weights, logits=logits)
weighted_sum_loss = losses.compute_weighted_loss(
unweighted_loss, weights=weights, reduction=losses.Reduction.SUM)
# _weights() can return 1.
@@ -1113,19 +1111,18 @@ class _RegressionHeadWithMeanSquaredErrorLoss(_Head):
train_op=train_op_fn(weighted_sum_loss))
-def _assert_range(labels, n_classes, message=None):
+def _assert_range(labels, n_classes):
with ops.name_scope(None, 'assert_range', (labels,)):
assert_less = check_ops.assert_less(
labels,
ops.convert_to_tensor(n_classes, dtype=labels.dtype),
- message=message or 'Label IDs must < n_classes')
+ message='Label IDs must < n_classes')
assert_greater = check_ops.assert_non_negative(
- labels, message=message or 'Label IDs must >= 0')
+ labels, message='Label IDs must >= 0')
with ops.control_dependencies((assert_less, assert_greater)):
return array_ops.identity(labels)
-# TODO(b/69000400): Delete this method.
def _weights(features, weight_column):
"""Fetches weights from features."""
with ops.name_scope(None, 'weights', values=features.values()):
diff --git a/tensorflow/python/estimator/canned/head_test.py b/tensorflow/python/estimator/canned/head_test.py
index 4497cd26f2..0a4ea7d81c 100644
--- a/tensorflow/python/estimator/canned/head_test.py
+++ b/tensorflow/python/estimator/canned/head_test.py
@@ -987,14 +987,12 @@ class MultiClassHeadWithSoftmaxCrossEntropyLoss(test.TestCase):
spec.loss.eval()
def test_multi_dim_train_weights_wrong_outer_dim(self):
- """Logits of shape [2, 2, 2], labels [2, 2, 1], weights [2, 2, 3]."""
+ """Logits of shape [2, 2, 2], labels [2, 2, 1], weights [2, 2, 2]."""
head = head_lib._multi_class_head_with_softmax_cross_entropy_loss(
n_classes=3, weight_column='weights')
logits = np.array([[[10, 0, 0], [12, 0, 0]],
[[0, 10, 0], [0, 15, 0]]], dtype=np.float32)
labels = np.array([[[0], [1]], [[1], [2]]], dtype=np.int64)
- weights = np.array([[[1., 1.1, 1.2], [1.5, 1.6, 1.7]],
- [[2., 2.1, 2.2], [2.5, 2.6, 2.7]]])
weights_placeholder = array_ops.placeholder(dtype=dtypes.float32)
def _no_op_train_fn(loss):
del loss
@@ -1010,8 +1008,10 @@ class MultiClassHeadWithSoftmaxCrossEntropyLoss(test.TestCase):
_initialize_variables(self, monitored_session.Scaffold())
with self.assertRaisesRegexp(
errors.InvalidArgumentError,
- r'\[logits_shape: \]\s\[2 2 3\]\s\[weights_shape: \]\s\[2 2 3\]'):
- spec.loss.eval({weights_placeholder: weights})
+ r'\[logits_shape: \]\s\[2 2 3\]\s\[weights_shape: \]\s\[2 2 2\]'):
+ spec.loss.eval({
+ weights_placeholder: np.array([[[1., 1.1], [1.5, 1.6]],
+ [[2., 2.1], [2.5, 2.6]]])})
def test_multi_dim_weighted_eval(self):
"""Logits of shape [2, 2, 2], labels [2, 2, 1], weights [2, 2]."""
diff --git a/tensorflow/python/estimator/estimator.py b/tensorflow/python/estimator/estimator.py
index 2d036e2cfb..a730e107ba 100644
--- a/tensorflow/python/estimator/estimator.py
+++ b/tensorflow/python/estimator/estimator.py
@@ -537,7 +537,7 @@ class Estimator(object):
temp_export_dir = get_temp_export_dir(export_dir)
# TODO(soergel): Consider whether MonitoredSession makes sense here
- with tf_session.Session(config=self._session_config) as session:
+ with tf_session.Session() as session:
saver_for_restore = estimator_spec.scaffold.saver or saver.Saver(
sharded=True)
diff --git a/tensorflow/python/estimator/estimator_lib.py b/tensorflow/python/estimator/estimator_lib.py
index bed2b67419..5b82fd75ff 100644
--- a/tensorflow/python/estimator/estimator_lib.py
+++ b/tensorflow/python/estimator/estimator_lib.py
@@ -19,8 +19,6 @@ from __future__ import division
from __future__ import print_function
# pylint: disable=unused-import,line-too-long,wildcard-import
-from tensorflow.python.estimator.canned.baseline import BaselineClassifier
-from tensorflow.python.estimator.canned.baseline import BaselineRegressor
from tensorflow.python.estimator.canned.dnn import DNNClassifier
from tensorflow.python.estimator.canned.dnn import DNNRegressor
from tensorflow.python.estimator.canned.dnn_linear_combined import DNNLinearCombinedClassifier
@@ -48,8 +46,6 @@ from tensorflow.python.util.all_util import remove_undocumented
_allowed_symbols = [
# Canned Estimators
- 'BaselineClassifier',
- 'BaselineRegressor',
'DNNClassifier',
'DNNRegressor',
'DNNLinearCombinedClassifier',
diff --git a/tensorflow/python/estimator/estimator_test.py b/tensorflow/python/estimator/estimator_test.py
index c1b773b8c4..2b9b44523b 100644
--- a/tensorflow/python/estimator/estimator_test.py
+++ b/tensorflow/python/estimator/estimator_test.py
@@ -50,7 +50,6 @@ from tensorflow.python.ops import lookup_ops
from tensorflow.python.ops import metrics as metrics_lib
from tensorflow.python.ops import parsing_ops
from tensorflow.python.ops import state_ops
-from tensorflow.python.ops import string_ops
from tensorflow.python.ops import variables
from tensorflow.python.ops.losses import losses
from tensorflow.python.platform import gfile
@@ -1911,71 +1910,6 @@ class EstimatorExportTest(test.TestCase):
est.train(dummy_input_fn, steps=1)
est.export_savedmodel(tempfile.mkdtemp(), serving_input_receiver_fn)
- def test_export_savedmodel_respects_soft_placement(self):
- def model_fn_with_a_gpu_op_but_no_kernel(features, labels, mode):
- _, _ = features, labels
- table = saver_test_utils.CheckpointedOp(name='v2')
-
- update_global_step = state_ops.assign_add(training.get_global_step(), 1)
- with ops.control_dependencies([update_global_step]):
- train_op = table.insert('k1', 30.0)
-
- # In this test, there are no GPUs available. The goal is to verify that
- # export_savedmodel executes nevertheless.
- with ops.device('/gpu:0'):
- string_op = string_ops.as_string(update_global_step)
-
- with ops.control_dependencies([string_op]):
- prediction = table.lookup('k1', 0.0)
-
- return model_fn_lib.EstimatorSpec(
- mode,
- predictions=prediction,
- loss=constant_op.constant(1.),
- train_op=train_op,
- export_outputs={
- 'test': export_output.PredictOutput({
- 'prediction': prediction
- })
- })
-
- tmpdir = tempfile.mkdtemp()
- est = estimator.Estimator(
- model_fn=model_fn_with_a_gpu_op_but_no_kernel)
- est.train(input_fn=dummy_input_fn, steps=1)
- feature_spec = {'x': parsing_ops.VarLenFeature(dtype=dtypes.int64),
- 'y': parsing_ops.VarLenFeature(dtype=dtypes.int64)}
- serving_input_receiver_fn = export.build_parsing_serving_input_receiver_fn(
- feature_spec)
- export_dir_base = os.path.join(
- compat.as_bytes(tmpdir), compat.as_bytes('export'))
-
- export_dir = est.export_savedmodel(
- export_dir_base, serving_input_receiver_fn)
-
- # At this point, if export_savedmodel executed with
- # allow_soft_placement=True, then the GPU-assigned operation was silently
- # placed on the CPU. Otherwise, an exception would have been raised
- # related to the fact that the requested GPU device isn't available.
-
- # Expectations below assume that export_savedmodel has completed normally.
- self.assertTrue(gfile.Exists(export_dir_base))
- self.assertTrue(gfile.Exists(export_dir))
- self.assertTrue(gfile.Exists(os.path.join(
- compat.as_bytes(export_dir),
- compat.as_bytes('saved_model.pb'))))
- self.assertTrue(gfile.Exists(os.path.join(
- compat.as_bytes(export_dir),
- compat.as_bytes('variables'))))
- self.assertTrue(gfile.Exists(os.path.join(
- compat.as_bytes(export_dir),
- compat.as_bytes('variables/variables.index'))))
- self.assertTrue(gfile.Exists(os.path.join(
- compat.as_bytes(export_dir),
- compat.as_bytes('variables/variables.data-00000-of-00001'))))
-
- gfile.DeleteRecursively(tmpdir)
-
class EstimatorHookOrderingTest(test.TestCase):
diff --git a/tensorflow/python/estimator/inputs/numpy_io.py b/tensorflow/python/estimator/inputs/numpy_io.py
index c9f37f06e8..3512f66284 100644
--- a/tensorflow/python/estimator/inputs/numpy_io.py
+++ b/tensorflow/python/estimator/inputs/numpy_io.py
@@ -19,6 +19,7 @@ from __future__ import division
from __future__ import print_function
import collections
+from six import string_types
from tensorflow.python.estimator.inputs.queues import feeding_functions
# Key name to pack the target into dict of `features`. See
@@ -51,8 +52,9 @@ def numpy_input_fn(x,
num_threads=1):
"""Returns input function that would feed dict of numpy arrays into the model.
- This returns a function outputting `features` and `target` based on the dict
- of numpy arrays. The dict `features` has the same keys as the `x`.
+ This returns a function outputting `features` and `targets` based on the dict
+  of numpy arrays. The dict `features` has the same keys as `x`. The dict
+  `targets` has the same keys as `y` if `y` is a dict.
Example:
@@ -69,7 +71,7 @@ def numpy_input_fn(x,
Args:
x: dict of numpy array object.
- y: numpy array object. `None` if absent.
+ y: numpy array object or dict of numpy array object. `None` if absent.
batch_size: Integer, size of batches to return.
num_epochs: Integer, number of epochs to iterate over data. If `None` will
run forever.
@@ -81,11 +83,13 @@ def numpy_input_fn(x,
such as in prediction and evaluation mode, `num_threads` should be 1.
Returns:
- Function, that has signature of ()->(dict of `features`, `target`)
+ Function, that has signature of ()->(dict of `features`, `targets`)
Raises:
ValueError: if the shape of `y` mismatches the shape of values in `x` (i.e.,
values in `x` have same shape).
+ ValueError: if duplicate keys are in both `x` and `y` when `y` is a dict.
+ ValueError: if x or y is an empty dict.
TypeError: `x` is not a dict or `shuffle` is not bool.
"""
@@ -97,43 +101,76 @@ def numpy_input_fn(x,
"""Numpy input function."""
if not isinstance(x, dict):
raise TypeError('x must be dict; got {}'.format(type(x).__name__))
+ if not x:
+ raise ValueError('x cannot be empty')
# Make a shadow copy and also ensure the order of iteration is consistent.
- ordered_dict_x = collections.OrderedDict(
+ ordered_dict_data = collections.OrderedDict(
sorted(x.items(), key=lambda t: t[0]))
+    # Copy the keys up front, since dict.keys() is a view in Python 3 and
+    # the dict is mutated below.
+ feature_keys = list(ordered_dict_data.keys())
+
+ if y is None:
+ target_keys = None
+ elif isinstance(y, dict):
+ if not y:
+ raise ValueError('y cannot be empty dict, use None instead.')
+
+ ordered_dict_y = collections.OrderedDict(
+ sorted(y.items(), key=lambda t: t[0]))
+ target_keys = list(ordered_dict_y.keys())
+
+ duplicate_keys = set(feature_keys).intersection(set(target_keys))
+ if len(duplicate_keys):
+ raise ValueError('{} duplicate keys are found in both x and y: '
+ '{}'.format(len(duplicate_keys), duplicate_keys))
+
+ ordered_dict_data.update(ordered_dict_y)
+ else:
+ target_keys = _get_unique_target_key(ordered_dict_data)
+ ordered_dict_data[target_keys] = y
+
+ if len(set(v.shape[0] for v in ordered_dict_data.values())) != 1:
+ shape_dict_of_x = {k: ordered_dict_data[k].shape
+ for k in feature_keys}
+
+ if target_keys is None:
+ shape_of_y = None
+ elif isinstance(target_keys, string_types):
+ shape_of_y = y.shape
+ else:
+ shape_of_y = {k: ordered_dict_data[k].shape
+ for k in target_keys}
- unique_target_key = _get_unique_target_key(ordered_dict_x)
- if y is not None:
- ordered_dict_x[unique_target_key] = y
-
- if len(set(v.shape[0] for v in ordered_dict_x.values())) != 1:
- shape_dict_of_x = {k: ordered_dict_x[k].shape
- for k in ordered_dict_x.keys()}
- shape_of_y = None if y is None else y.shape
raise ValueError('Length of tensors in x and y is mismatched. All '
'elements in x and y must have the same length.\n'
'Shapes in x: {}\n'
- 'Shape for y: {}\n'.format(shape_dict_of_x, shape_of_y))
+ 'Shapes in y: {}\n'.format(shape_dict_of_x, shape_of_y))
queue = feeding_functions._enqueue_data( # pylint: disable=protected-access
- ordered_dict_x,
+ ordered_dict_data,
queue_capacity,
shuffle=shuffle,
num_threads=num_threads,
enqueue_size=batch_size,
num_epochs=num_epochs)
- features = (queue.dequeue_many(batch_size) if num_epochs is None
+ batch = (queue.dequeue_many(batch_size) if num_epochs is None
else queue.dequeue_up_to(batch_size))
- # Remove the first `Tensor` in `features`, which is the row number.
- if len(features) > 0:
- features.pop(0)
+ # Remove the first `Tensor` in `batch`, which is the row number.
+ if len(batch) > 0:
+ batch.pop(0)
- features = dict(zip(ordered_dict_x.keys(), features))
- if y is not None:
- target = features.pop(unique_target_key)
+ features = dict(zip(feature_keys, batch[:len(feature_keys)]))
+ if target_keys is None:
+      # TODO(martinwicke): return a consistent result.
+ return features
+ elif isinstance(target_keys, string_types):
+ target = batch[-1]
+ return features, target
+ else:
+ target = dict(zip(target_keys, batch[-len(target_keys):]))
return features, target
- return features
return input_fn
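A plain-Python sketch of the slicing at the end of input_fn above: after the row-number column is popped, `batch` holds the feature columns first and the target columns last (the string values here are placeholders for tensors):

    feature_keys = ['a', 'b']
    target_keys = ['y1', 'y2']
    batch = ['a_vals', 'b_vals', 'y1_vals', 'y2_vals']
    features = dict(zip(feature_keys, batch[:len(feature_keys)]))
    target = dict(zip(target_keys, batch[-len(target_keys):]))
    print(features)  # {'a': 'a_vals', 'b': 'b_vals'}
    print(target)    # {'y1': 'y1_vals', 'y2': 'y2_vals'}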
diff --git a/tensorflow/python/estimator/inputs/numpy_io_test.py b/tensorflow/python/estimator/inputs/numpy_io_test.py
index 02df22b632..65eae7a7dc 100644
--- a/tensorflow/python/estimator/inputs/numpy_io_test.py
+++ b/tensorflow/python/estimator/inputs/numpy_io_test.py
@@ -239,6 +239,40 @@ class NumpyIoTest(test.TestCase):
x, y, batch_size=2, shuffle=False, num_epochs=1)
failing_input_fn()
+ def testNumpyInputFnWithXIsEmptyDict(self):
+ x = {}
+ y = np.arange(4)
+ with self.test_session():
+ with self.assertRaisesRegexp(ValueError, 'x cannot be empty'):
+ failing_input_fn = numpy_io.numpy_input_fn(x, y, shuffle=False)
+ failing_input_fn()
+
+ def testNumpyInputFnWithYIsNone(self):
+ a = np.arange(4) * 1.0
+ b = np.arange(32, 36)
+ x = {'a': a, 'b': b}
+ y = None
+
+ with self.test_session() as session:
+ input_fn = numpy_io.numpy_input_fn(
+ x, y, batch_size=2, shuffle=False, num_epochs=1)
+ features_tensor = input_fn()
+
+ coord = coordinator.Coordinator()
+ threads = queue_runner_impl.start_queue_runners(session, coord=coord)
+
+ feature = session.run(features_tensor)
+ self.assertEqual(len(feature), 2)
+ self.assertAllEqual(feature['a'], [0, 1])
+ self.assertAllEqual(feature['b'], [32, 33])
+
+ session.run([features_tensor])
+ with self.assertRaises(errors.OutOfRangeError):
+ session.run([features_tensor])
+
+ coord.request_stop()
+ coord.join(threads)
+
def testNumpyInputFnWithNonBoolShuffle(self):
x = np.arange(32, 36)
y = np.arange(4)
@@ -285,6 +319,59 @@ class NumpyIoTest(test.TestCase):
num_epochs=1)
failing_input_fn()
+ def testNumpyInputFnWithYAsDict(self):
+ a = np.arange(4) * 1.0
+ b = np.arange(32, 36)
+ x = {'a': a, 'b': b}
+ y = {'y1': np.arange(-32, -28), 'y2': np.arange(32, 28, -1)}
+
+ with self.test_session() as session:
+ input_fn = numpy_io.numpy_input_fn(
+ x, y, batch_size=2, shuffle=False, num_epochs=1)
+ features_tensor, targets_tensor = input_fn()
+
+ coord = coordinator.Coordinator()
+ threads = queue_runner_impl.start_queue_runners(session, coord=coord)
+
+ features, targets = session.run([features_tensor, targets_tensor])
+ self.assertEqual(len(features), 2)
+ self.assertAllEqual(features['a'], [0, 1])
+ self.assertAllEqual(features['b'], [32, 33])
+ self.assertEqual(len(targets), 2)
+ self.assertAllEqual(targets['y1'], [-32, -31])
+ self.assertAllEqual(targets['y2'], [32, 31])
+
+ session.run([features_tensor, targets_tensor])
+ with self.assertRaises(errors.OutOfRangeError):
+ session.run([features_tensor, targets_tensor])
+
+ coord.request_stop()
+ coord.join(threads)
+
+ def testNumpyInputFnWithYIsEmptyDict(self):
+ a = np.arange(4) * 1.0
+ b = np.arange(32, 36)
+ x = {'a': a, 'b': b}
+ y = {}
+ with self.test_session():
+ with self.assertRaisesRegexp(ValueError, 'y cannot be empty'):
+ failing_input_fn = numpy_io.numpy_input_fn(x, y, shuffle=False)
+ failing_input_fn()
+
+ def testNumpyInputFnWithDuplicateKeysInXAndY(self):
+ a = np.arange(4) * 1.0
+ b = np.arange(32, 36)
+ x = {'a': a, 'b': b}
+ y = {'y1': np.arange(-32, -28),
+ 'a': a,
+ 'y2': np.arange(32, 28, -1),
+ 'b': b}
+ with self.test_session():
+ with self.assertRaisesRegexp(
+ ValueError, '2 duplicate keys are found in both x and y'):
+ failing_input_fn = numpy_io.numpy_input_fn(x, y, shuffle=False)
+ failing_input_fn()
+
if __name__ == '__main__':
test.main()
diff --git a/tensorflow/python/framework/function.py b/tensorflow/python/framework/function.py
index 29cf223724..cef3f8d4c4 100644
--- a/tensorflow/python/framework/function.py
+++ b/tensorflow/python/framework/function.py
@@ -100,7 +100,7 @@ class Defun(object):
grad_func - (optional). A function implementing the gradient
of the function-to-register. This is must be a
`_DefinedFunction` object. The gradient
- function must satisfy the criterion defined in
+ function must satisify the criterion defined in
function.proto:GradientDef.
python_grad_func - (optional). A function implementing the
diff --git a/tensorflow/python/framework/function_test.py b/tensorflow/python/framework/function_test.py
index ba43e9199b..36b0737cfc 100644
--- a/tensorflow/python/framework/function_test.py
+++ b/tensorflow/python/framework/function_test.py
@@ -370,7 +370,7 @@ class FunctionTest(test.TestCase):
@function.Defun(dtypes.float32)
def Foo(x):
- y = logging_ops.Print(x, [], "Hello")
+ y = logging_ops.Print(x, [x], "Hello")
with ops.control_dependencies([y]):
z = control_flow_ops.no_op()
with ops.control_dependencies([z]):
diff --git a/tensorflow/python/framework/ops.py b/tensorflow/python/framework/ops.py
index ad2e2993c1..ab4455534e 100644
--- a/tensorflow/python/framework/ops.py
+++ b/tensorflow/python/framework/ops.py
@@ -617,16 +617,15 @@ class _EagerTensorBase(Tensor):
return dtypes._INTERN_TABLE[self._datatype_enum()] # pylint: disable=protected-access
def numpy(self):
- """Returns a numpy array or a scalar with the same contents as the Tensor.
+ """Returns a numpy array with the same contents as the Tensor.
TODO(ashankar,agarwal): Perhaps this should NOT reference the underlying
buffer but instead always explicitly copy? Note that currently it may or may
not copy based on whether the numpy data is properly aligned or not.
Returns:
- A numpy array or a scalar. Numpy array may share memory with the
- Tensor object. Any changes to one may be reflected in the other. A scalar
- value is returned when self has rank 0.
+ A numpy array that may share memory with the Tensor object. Any changes
+ to one may be reflected in the other.
Raises:
ValueError: if the type of this Tensor is not representable in numpy.
@@ -864,6 +863,10 @@ def convert_to_tensor(value, dtype=None, name=None, preferred_dtype=None):
inputs, which allows those ops to accept numpy arrays, Python lists,
and scalars in addition to `Tensor` objects.
+ Note: This function diverges from default Numpy behavior for `float` and
+ `string` types when `None` is present in a Python list or scalar. Rather
+ than silently converting `None` values, an error will be thrown.
+
Args:
value: An object whose type has a registered `Tensor` conversion function.
dtype: Optional element type for the returned tensor. If missing, the
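A standalone sketch of the divergence the note above describes (assumes a TF 1.x runtime, as elsewhere in this diff): NumPy coerces None to NaN when a float dtype is forced, while convert_to_tensor raises:

    import numpy as np
    import tensorflow as tf

    print(np.array([1.0, None], dtype=np.float64))  # [ 1. nan]
    try:
      tf.convert_to_tensor([1.0, None])
    except ValueError as e:
      print(e)  # raised rather than silently converting None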
@@ -1641,15 +1644,13 @@ class Operation(object):
default_colocation_group = [
compat.as_bytes("loc:@%s" % self._node_def.name)
]
- try:
- class_attr = self.get_attr("_class")
- except ValueError:
+ if "_class" not in self._node_def.attr:
# This op has no explicit colocation group, so it is itself its
# own root of a colocation group.
return default_colocation_group
attr_groups = [
- class_name for class_name in class_attr
+ class_name for class_name in self.get_attr("_class")
if class_name.startswith(b"loc:@")
]
@@ -2064,19 +2065,16 @@ class Operation(object):
def _set_attr(self, attr_name, attr_value):
"""Private method used to set an attribute in the node_def."""
- if _USE_C_API:
- buf = c_api.TF_NewBufferFromString(
- compat.as_bytes(attr_value.SerializeToString()))
- try:
- with errors.raise_exception_on_not_ok_status() as status:
- # pylint: disable=protected-access
- c_api.SetAttr(self._graph._c_graph, self._c_op, attr_name, buf,
- status)
- # pylint: enable=protected-access
- finally:
- c_api.TF_DeleteBuffer(buf)
- else:
- self._node_def.attr[attr_name].CopyFrom(attr_value)
+ if not _USE_C_API:
+ assert "_set_attr not supported with _USE_C_API == False"
+ return
+ buf = c_api.TF_NewBufferFromString(
+ compat.as_bytes(attr_value.SerializeToString()))
+ try:
+ with errors.raise_exception_on_not_ok_status() as status:
+ c_api.SetAttr(self._graph._c_graph, self._c_op, attr_name, buf, status) # pylint: disable=protected-access
+ finally:
+ c_api.TF_DeleteBuffer(buf)
def get_attr(self, name):
"""Returns the value of the attr of this op with the given `name`.
@@ -2090,24 +2088,25 @@ class Operation(object):
Raises:
ValueError: If this op does not have an attr with the given `name`.
"""
- fields = ["s", "i", "f", "b", "type", "shape", "tensor", "func"]
- if self._c_op:
+ if _USE_C_API:
try:
- with c_api_util.tf_buffer() as buf:
- with errors.raise_exception_on_not_ok_status() as status:
- c_api.TF_OperationGetAttrValueProto(self._c_op, name, buf, status)
- data = c_api.TF_GetBuffer(buf)
- except errors.InvalidArgumentError as e:
- # Convert to ValueError for backwards compatibility.
- raise ValueError(str(e))
- x = attr_value_pb2.AttrValue()
- x.ParseFromString(data)
- else:
- if name not in self._node_def.attr:
- raise ValueError(
- "No attr named '" + name + "' in " + str(self._node_def))
- x = self._node_def.attr[name]
+ # TODO(b/65162920): remove this try/except block when all attrs are
+ # implemented to use the _set_attr method instead of node_def.attr.
+ with errors.raise_exception_on_not_ok_status() as status:
+ metadata = c_api.TF_OperationGetAttrMetadata(self._c_op, name, status)
+ with errors.raise_exception_on_not_ok_status() as status:
+ if metadata.type == c_api.TF_ATTR_INT and metadata.is_list == 0:
+ return c_api.TF_OperationGetAttrInt(self._c_op, name, status)
+ except errors.InvalidArgumentError:
+      # Colocation ops fail to find attrs beginning with "_*". They should
+      # fall through to the non-C-API logic until the attribute is always
+      # set via the C API.
+ pass
+ fields = ["s", "i", "f", "b", "type", "shape", "tensor", "func"]
+ if name not in self._node_def.attr:
+ raise ValueError("No attr named '" + name + "' in " + str(self._node_def))
+ x = self._node_def.attr[name]
# Treat an empty oneof value as an empty list.
if not x.WhichOneof("value"):
return []
@@ -3107,10 +3106,9 @@ class Graph(object):
ret._set_device(colocation_op.device) # pylint: disable=protected-access
all_colocation_groups = sorted(set(all_colocation_groups))
- # pylint: disable=protected-access
- ret._set_attr("_class", attr_value_pb2.AttrValue(
- list=attr_value_pb2.AttrValue.ListValue(s=all_colocation_groups)))
- # pylint: enable=protected-access
+ ret.node_def.attr["_class"].CopyFrom(
+ attr_value_pb2.AttrValue(list=attr_value_pb2.AttrValue.ListValue(
+ s=all_colocation_groups)))
# Sets "container" attribute if
# (1) self._container is not None
diff --git a/tensorflow/python/framework/ops_test.py b/tensorflow/python/framework/ops_test.py
index 4e931e00c5..3087d6060b 100644
--- a/tensorflow/python/framework/ops_test.py
+++ b/tensorflow/python/framework/ops_test.py
@@ -31,11 +31,9 @@ from tensorflow.python.framework import constant_op
from tensorflow.python.framework import device as pydev
from tensorflow.python.framework import dtypes
from tensorflow.python.framework import errors
-from tensorflow.python.framework import function
from tensorflow.python.framework import ops
from tensorflow.python.framework import sparse_tensor
from tensorflow.python.framework import tensor_shape
-from tensorflow.python.framework import tensor_util
from tensorflow.python.framework import test_ops
from tensorflow.python.framework import test_util
from tensorflow.python.framework import versions
@@ -359,55 +357,54 @@ class OperationTest(test_util.TensorFlowTestCase):
self.assertEqual("<tf.Operation 'op1' type=None>", repr(op))
def testGetAttr(self):
- op = test_ops.default_attrs()
- self.assertEqual(op.get_attr("string_val"), b"abc")
- self.assertEqual(op.get_attr("string_list_val"), [b"abc", b""])
- self.assertEqual(op.get_attr("int_val"), 123)
- self.assertEqual(op.get_attr("int_list_val"), [1, 2, 3])
- self.assertEqual(op.get_attr("float_val"), 10.0)
- self.assertEqual(op.get_attr("float_list_val"), [10.0])
- self.assertEqual(op.get_attr("bool_val"), True)
- self.assertEqual(op.get_attr("bool_list_val"), [True, False])
- self.assertEqual(op.get_attr("shape_val"),
- tensor_shape.as_shape([2, 1]).as_proto())
- self.assertEqual(op.get_attr("shape_list_val"),
- [tensor_shape.as_shape([]).as_proto(),
- tensor_shape.as_shape([1]).as_proto()])
- self.assertEqual(op.get_attr("tensor_val"),
- tensor_util.make_tensor_proto(1, dtypes.int32))
- self.assertEqual(op.get_attr("tensor_list_val"),
- [tensor_util.make_tensor_proto(1, dtypes.int32)])
-
- type_val = op.get_attr("type_val")
- # First check that type_val is a DType, because the assertEquals will work
- # no matter what since DType overrides __eq__
- self.assertIsInstance(type_val, dtypes.DType)
- self.assertEqual(type_val, dtypes.int32)
-
- type_list_val = op.get_attr("type_list_val")
- self.assertTrue(all(isinstance(x, dtypes.DType) for x in type_list_val))
- self.assertEqual(type_list_val, [dtypes.int32, dtypes.float32])
-
- @function.Defun(dtypes.float32, func_name="MyFunc")
- def func(x):
- return x
-
- op = test_ops.func_attr(func)
- self.assertEqual(op.get_attr("f"),
- attr_value_pb2.NameAttrList(name="MyFunc"))
-
- # Try fetching missing attr
+ # TODO(b/65162920): implement all tests for get_attr with C API
if ops._USE_C_API:
- error_msg = "Operation 'FuncAttr' has no attr named 'FakeAttr'."
- else:
- error_msg = "No attr named 'FakeAttr' in name: \"FuncAttr\""
+ op = test_ops.int_attr().op
+ self.assertEqual(op.get_attr("foo"), 1)
+
+ op_str = test_ops.string_list_attr(a=["z"], b="y")
+ self.assertEqual(op_str.get_attr("a"), [b"z"])
+ self.assertEqual(op_str.get_attr("b"), b"y")
- with self.assertRaisesRegexp(ValueError, error_msg):
- op.get_attr("FakeAttr")
+ else:
+ list_value = attr_value_pb2.AttrValue.ListValue()
+
+ list_value.type.append(types_pb2.DT_STRING)
+ list_value.type.append(types_pb2.DT_DOUBLE)
+ op = ops.Operation(
+ ops._NodeDef(
+ "None",
+ "op1",
+ attrs={
+ "value":
+ attr_value_pb2.AttrValue(i=32),
+ "dtype":
+ attr_value_pb2.AttrValue(type=types_pb2.DT_INT32),
+ "list":
+ attr_value_pb2.AttrValue(list=list_value),
+ "func":
+ attr_value_pb2.AttrValue(
+ func=attr_value_pb2.NameAttrList())
+ }), ops.Graph(), [], [dtypes.int32])
+ self.assertEqual(32, op.get_attr("value"))
+ self.assertEqual("", op.get_attr("func").name)
+
+ d = op.get_attr("dtype")
+ # First check that d is a DType, because the assertEquals will
+ # work no matter what since DType overrides __eq__
+ self.assertIsInstance(d, dtypes.DType)
+ self.assertEqual(dtypes.int32, d)
+
+ l = op.get_attr("list")
+ for x in l:
+ self.assertIsInstance(x, dtypes.DType)
+ self.assertEqual([dtypes.string, dtypes.double], l)
# TODO(b/65162920): remove this test when users who are directly mutating the
# node_def have been updated to proper usage.
def testSetAttr(self):
+ if not ops._USE_C_API:
+ return
op = test_ops.int_attr().op
op._set_attr("foo", attr_value_pb2.AttrValue(i=2))
# TODO(skyewm): add node_def check
diff --git a/tensorflow/python/framework/tensor_util.py b/tensorflow/python/framework/tensor_util.py
index 7e74c19124..e283542172 100644
--- a/tensorflow/python/framework/tensor_util.py
+++ b/tensorflow/python/framework/tensor_util.py
@@ -286,6 +286,7 @@ _TF_TO_IS_OK = {
dtypes.bool: [_FilterBool],
dtypes.complex128: [_FilterComplex],
dtypes.complex64: [_FilterComplex],
+ dtypes.float16: [_FilterFloat],
dtypes.float32: [_FilterFloat],
dtypes.float64: [_FilterFloat],
dtypes.int16: [_FilterInt],
diff --git a/tensorflow/python/framework/test_ops.cc b/tensorflow/python/framework/test_ops.cc
index 35e0167b26..a8b7fc543f 100644
--- a/tensorflow/python/framework/test_ops.cc
+++ b/tensorflow/python/framework/test_ops.cc
@@ -341,27 +341,4 @@ REGISTER_OP("StringListAttr")
.Attr("b: string")
.SetShapeFn(shape_inference::UnknownShape);
-REGISTER_OP("DefaultAttrs")
- .Attr("string_val: string = 'abc'")
- .Attr("string_list_val: list(string) = ['abc', '']")
- .Attr("int_val: int = 123")
- .Attr("int_list_val: list(int) = [1, 2, 3]")
- .Attr("float_val: float = 10.0")
- .Attr("float_list_val: list(float) = [10.0]")
- .Attr("bool_val: bool = true")
- .Attr("bool_list_val: list(bool) = [true, false]")
- .Attr("type_val: type = DT_INT32")
- .Attr("type_list_val: list(type) = [DT_INT32, DT_FLOAT]")
- .Attr("shape_val: shape = { dim { size: 2 } dim { size: 1 } }")
- .Attr("shape_list_val: list(shape) = [{}, { dim { size: 1} }]")
- .Attr("tensor_val: tensor = { dtype: DT_INT32 tensor_shape: {} int_val: 1}")
- .Attr(
- "tensor_list_val: list(tensor) = "
- "[{ dtype: DT_INT32 tensor_shape: {} int_val: 1}]")
- .SetShapeFn(shape_inference::UnknownShape);
-
-REGISTER_OP("FuncAttr")
- .Attr("f: func")
- .SetShapeFn(shape_inference::UnknownShape);
-
} // end namespace tensorflow
diff --git a/tensorflow/python/framework/test_util.py b/tensorflow/python/framework/test_util.py
index 6e3a35af3c..d796b0ebea 100644
--- a/tensorflow/python/framework/test_util.py
+++ b/tensorflow/python/framework/test_util.py
@@ -984,8 +984,9 @@ class TensorFlowTestCase(googletest.TestCase):
err: A float value.
msg: An optional string message to append to the failure message.
"""
+ # f1 == f2 is needed here as we might have: f1, f2 = inf, inf
self.assertTrue(
- math.fabs(f1 - f2) <= err,
+ f1 == f2 or math.fabs(f1 - f2) <= err,
"%f != %f +/- %f%s" % (f1, f2, err, " (%s)" % msg
if msg is not None else ""))
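The short-circuit added above matters because inf - inf is NaN, and any comparison with NaN is False; a minimal check:

    import math

    f1 = f2 = math.inf
    print(f1 - f2)                   # nan
    print(math.fabs(f1 - f2) <= 1.)  # False: the old check rejected equal infs
    print(f1 == f2)                  # True: the added clause accepts them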
diff --git a/tensorflow/python/grappler/model_analyzer.cc b/tensorflow/python/grappler/model_analyzer.cc
index 7d365c3be9..4ec7620bce 100644
--- a/tensorflow/python/grappler/model_analyzer.cc
+++ b/tensorflow/python/grappler/model_analyzer.cc
@@ -59,15 +59,10 @@ void ModelAnalyzer::PrintNodeInfo(const NodeDef* node,
if (i > 0) {
os << ", ";
}
- if (prop.shape().dim(i).size() >= 0) {
- // Print the actual dimension.
- os << prop.shape().dim(i).size();
- } else if (prop.shape().dim(i).size() == -1) {
- // We don't know anything about the dimension.
+ if (prop.shape().dim(i).size() < 0) {
os << "?";
} else {
- // Symbolic dimension.
- os << "x" << -prop.shape().dim(i).size();
+ os << prop.shape().dim(i).size();
}
}
os << "]";
diff --git a/tensorflow/python/keras/BUILD b/tensorflow/python/keras/BUILD
index 6a762ee5d2..4db48b45ed 100644
--- a/tensorflow/python/keras/BUILD
+++ b/tensorflow/python/keras/BUILD
@@ -499,18 +499,6 @@ py_test(
)
py_test(
- name = "recurrent_test",
- size = "small",
- srcs = ["_impl/keras/layers/recurrent_test.py"],
- srcs_version = "PY2AND3",
- deps = [
- ":keras",
- "//tensorflow/python:client_testlib",
- "//third_party/py/numpy",
- ],
-)
-
-py_test(
name = "serialization_test",
size = "small",
srcs = ["_impl/keras/layers/serialization_test.py"],
diff --git a/tensorflow/python/keras/_impl/keras/engine/topology.py b/tensorflow/python/keras/_impl/keras/engine/topology.py
index 2bcbabf19c..f9be782f85 100644
--- a/tensorflow/python/keras/_impl/keras/engine/topology.py
+++ b/tensorflow/python/keras/_impl/keras/engine/topology.py
@@ -29,9 +29,6 @@ from six.moves import zip # pylint: disable=redefined-builtin
from tensorflow.python.eager import context
from tensorflow.python.framework import tensor_shape
from tensorflow.python.keras._impl.keras import backend as K
-from tensorflow.python.keras._impl.keras import constraints
-from tensorflow.python.keras._impl.keras import initializers
-from tensorflow.python.keras._impl.keras import regularizers
from tensorflow.python.keras._impl.keras.utils import conv_utils
from tensorflow.python.keras._impl.keras.utils.io_utils import ask_to_proceed_with_overwrite
from tensorflow.python.keras._impl.keras.utils.layer_utils import print_summary as print_layer_summary
@@ -212,9 +209,9 @@ class Layer(tf_base_layers.Layer):
dtype = K.floatx()
weight = self.add_variable(name, shape,
dtype=dtype,
- initializer=initializers.get(initializer),
- regularizer=regularizers.get(regularizer),
- constraint=constraints.get(constraint),
+ initializer=initializer,
+ regularizer=regularizer,
+ constraint=constraint,
trainable=trainable)
return weight
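With this change `add_weight` forwards `initializer`, `regularizer`, and `constraint` verbatim instead of resolving string identifiers, so callers that passed names must resolve them first. A hedged sketch using the getters this hunk stops importing:

```python
from tensorflow.python.keras._impl.keras import constraints
from tensorflow.python.keras._impl.keras import initializers
from tensorflow.python.keras._impl.keras import regularizers

# Resolve identifiers explicitly before handing them to add_weight:
initializer = initializers.get('glorot_uniform')
regularizer = regularizers.get('l2')
constraint = constraints.get('non_neg')
```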
diff --git a/tensorflow/python/keras/_impl/keras/integration_test.py b/tensorflow/python/keras/_impl/keras/integration_test.py
index 871a8c7329..7110036848 100644
--- a/tensorflow/python/keras/_impl/keras/integration_test.py
+++ b/tensorflow/python/keras/_impl/keras/integration_test.py
@@ -93,7 +93,7 @@ class KerasIntegrationTest(test.TestCase):
y_test = keras.utils.to_categorical(y_test)
model = keras.models.Sequential()
- model.add(keras.layers.LSTM(5, return_sequences=True,
+ model.add(keras.layers.LSTM(3, return_sequences=True,
input_shape=x_train.shape[1:]))
model.add(keras.layers.GRU(y_train.shape[-1], activation='softmax'))
model.compile(loss='categorical_crossentropy',
diff --git a/tensorflow/python/keras/_impl/keras/layers/gru_test.py b/tensorflow/python/keras/_impl/keras/layers/gru_test.py
index c57fbac41c..03f0736161 100644
--- a/tensorflow/python/keras/_impl/keras/layers/gru_test.py
+++ b/tensorflow/python/keras/_impl/keras/layers/gru_test.py
@@ -156,10 +156,8 @@ class GRULayerTest(test.TestCase):
activity_regularizer='l1')
layer.build((None, None, 2))
self.assertEqual(len(layer.losses), 3)
-
- x = keras.backend.variable(np.ones((2, 3, 2)))
- layer(x)
- self.assertEqual(len(layer.get_losses_for(x)), 1)
+ layer(keras.backend.variable(np.ones((2, 3, 2))))
+ self.assertEqual(len(layer.losses), 4)
def test_constraints_GRU(self):
embedding_dim = 4
@@ -177,9 +175,9 @@ class GRULayerTest(test.TestCase):
recurrent_constraint=r_constraint,
bias_constraint=b_constraint)
layer.build((None, None, embedding_dim))
- self.assertEqual(layer.cell.kernel.constraint, k_constraint)
- self.assertEqual(layer.cell.recurrent_kernel.constraint, r_constraint)
- self.assertEqual(layer.cell.bias.constraint, b_constraint)
+ self.assertEqual(layer.kernel.constraint, k_constraint)
+ self.assertEqual(layer.recurrent_kernel.constraint, r_constraint)
+ self.assertEqual(layer.bias.constraint, b_constraint)
def test_with_masking_layer_GRU(self):
layer_class = keras.layers.GRU
diff --git a/tensorflow/python/keras/_impl/keras/layers/lstm_test.py b/tensorflow/python/keras/_impl/keras/layers/lstm_test.py
index 8d359bf17c..f43d90fec8 100644
--- a/tensorflow/python/keras/_impl/keras/layers/lstm_test.py
+++ b/tensorflow/python/keras/_impl/keras/layers/lstm_test.py
@@ -156,9 +156,8 @@ class LSTMLayerTest(test.TestCase):
activity_regularizer='l1')
layer.build((None, None, 2))
self.assertEqual(len(layer.losses), 3)
- x = keras.backend.variable(np.ones((2, 3, 2)))
- layer(x)
- self.assertEqual(len(layer.get_losses_for(x)), 1)
+ layer(keras.backend.variable(np.ones((2, 3, 2))))
+ self.assertEqual(len(layer.losses), 4)
def test_constraints_LSTM(self):
embedding_dim = 4
@@ -176,9 +175,9 @@ class LSTMLayerTest(test.TestCase):
recurrent_constraint=r_constraint,
bias_constraint=b_constraint)
layer.build((None, None, embedding_dim))
- self.assertEqual(layer.cell.kernel.constraint, k_constraint)
- self.assertEqual(layer.cell.recurrent_kernel.constraint, r_constraint)
- self.assertEqual(layer.cell.bias.constraint, b_constraint)
+ self.assertEqual(layer.kernel.constraint, k_constraint)
+ self.assertEqual(layer.recurrent_kernel.constraint, r_constraint)
+ self.assertEqual(layer.bias.constraint, b_constraint)
def test_with_masking_layer_LSTM(self):
layer_class = keras.layers.LSTM
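Both test diffs above track the same structural change: without the cell abstraction, weights and their constraints hang directly off the layer. A hedged sketch of the two layouts (sizes hypothetical):

```python
from tensorflow.python import keras

k_constraint = keras.constraints.max_norm(0.01)
layer = keras.layers.GRU(4, kernel_constraint=k_constraint)
layer.build((None, None, 3))
# Post-revert (monolithic) layout:
assert layer.kernel.constraint is k_constraint
# Pre-revert (cell-based) layout, for comparison:
#   layer.cell.kernel.constraint is k_constraint
```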
diff --git a/tensorflow/python/keras/_impl/keras/layers/recurrent.py b/tensorflow/python/keras/_impl/keras/layers/recurrent.py
index 2bc74d5f80..139523403c 100644
--- a/tensorflow/python/keras/_impl/keras/layers/recurrent.py
+++ b/tensorflow/python/keras/_impl/keras/layers/recurrent.py
@@ -1,4 +1,4 @@
-# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+# Copyright 2015 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
@@ -29,209 +29,99 @@ from tensorflow.python.keras._impl.keras import initializers
from tensorflow.python.keras._impl.keras import regularizers
from tensorflow.python.keras._impl.keras.engine import InputSpec
from tensorflow.python.keras._impl.keras.engine import Layer
-from tensorflow.python.keras._impl.keras.utils.generic_utils import has_arg
-from tensorflow.python.platform import tf_logging as logging
-class StackedRNNCells(Layer):
- """Wrapper allowing a stack of RNN cells to behave as a single cell.
+# pylint: disable=access-member-before-definition
- Used to implement efficient stacked RNNs.
+
+def _time_distributed_dense(x,
+ w,
+ b=None,
+ dropout=None,
+ input_dim=None,
+ output_dim=None,
+ timesteps=None,
+ training=None):
+ """Apply `y . w + b` for every temporal slice y of x.
Arguments:
- cells: List of RNN cell instances.
+ x: input tensor.
+ w: weight matrix.
+ b: optional bias vector.
+ dropout: whether to apply dropout (same dropout mask
+ for every temporal slice of the input).
+ input_dim: integer; optional dimensionality of the input.
+ output_dim: integer; optional dimensionality of the output.
+ timesteps: integer; optional number of timesteps.
+ training: training phase tensor or boolean.
+
+ Returns:
+ Output tensor.
+ """
+ if not input_dim:
+ input_dim = K.shape(x)[2]
+ if not timesteps:
+ timesteps = K.shape(x)[1]
+ if not output_dim:
+ output_dim = K.shape(w)[1]
+
+ if dropout is not None and 0. < dropout < 1.:
+ # apply the same dropout pattern at every timestep
+ ones = K.ones_like(K.reshape(x[:, 0, :], (-1, input_dim)))
+ dropout_matrix = K.dropout(ones, dropout)
+ expanded_dropout_matrix = K.repeat(dropout_matrix, timesteps)
+ x = K.in_train_phase(x * expanded_dropout_matrix, x, training=training)
+
+ # collapse time dimension and batch dimension together
+ x = K.reshape(x, (-1, input_dim))
+ x = K.dot(x, w)
+ if b is not None:
+ x = K.bias_add(x, b)
+ # reshape to 3D tensor
+ if K.backend() == 'tensorflow':
+ x = K.reshape(x, K.stack([-1, timesteps, output_dim]))
+ x.set_shape([None, None, output_dim])
+ else:
+ x = K.reshape(x, (-1, timesteps, output_dim))
+ return x
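In effect `_time_distributed_dense` folds time into the batch axis, applies one dense transform, and unfolds again. A NumPy equivalent (dropout and backend branches omitted):

```python
import numpy as np

def time_distributed_dense(x, w, b=None):
    # x: (batch, timesteps, input_dim); w: (input_dim, output_dim)
    batch, timesteps, input_dim = x.shape
    y = x.reshape(-1, input_dim) @ w        # collapse batch and time
    if b is not None:
        y = y + b
    return y.reshape(batch, timesteps, -1)  # restore the time axis
```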
- Examples:
- ```python
- cells = [
- keras.layers.LSTMCell(output_dim),
- keras.layers.LSTMCell(output_dim),
- keras.layers.LSTMCell(output_dim),
- ]
-
- inputs = keras.Input((timesteps, input_dim))
- x = keras.layers.RNN(cells)(inputs)
- ```
- """
+class Recurrent(Layer):
+ """Abstract base class for recurrent layers.
- def __init__(self, cells, **kwargs):
- for cell in cells:
- if not hasattr(cell, 'call'):
- raise ValueError('All cells must have a `call` method. '
- 'received cells:', cells)
- if not hasattr(cell, 'state_size'):
- raise ValueError('All cells must have a '
- '`state_size` attribute. '
- 'received cells:', cells)
- self.cells = cells
- super(StackedRNNCells, self).__init__(**kwargs)
-
- @property
- def state_size(self):
- # States are a flat list
- # in reverse order of the cell stack.
- # This allows to preserve the requirement
- # `stack.state_size[0] == output_dim`.
- # e.g. states of a 2-layer LSTM would be
- # `[h2, c2, h1, c1]`
- # (assuming one LSTM has states [h, c])
- state_size = []
- for cell in self.cells[::-1]:
- if hasattr(cell.state_size, '__len__'):
- state_size += list(cell.state_size)
- else:
- state_size.append(cell.state_size)
- return tuple(state_size)
-
- def call(self, inputs, states, **kwargs):
- # Recover per-cell states.
- nested_states = []
- for cell in self.cells[::-1]:
- if hasattr(cell.state_size, '__len__'):
- nested_states.append(states[:len(cell.state_size)])
- states = states[len(cell.state_size):]
- else:
- nested_states.append([states[0]])
- states = states[1:]
- nested_states = nested_states[::-1]
-
- # Call the cells in order and store the returned states.
- new_nested_states = []
- for cell, states in zip(self.cells, nested_states):
- inputs, states = cell.call(inputs, states, **kwargs)
- new_nested_states.append(states)
-
- # Format the new states as a flat list
- # in reverse cell order.
- states = []
- for cell_states in new_nested_states[::-1]:
- states += cell_states
- return inputs, states
+ Do not use in a model -- it's not a valid layer!
+ Use its children classes `LSTM`, `GRU` and `SimpleRNN` instead.
- def build(self, input_shape):
- for cell in self.cells:
- if isinstance(cell, Layer):
- cell.build(input_shape)
- if hasattr(cell.state_size, '__len__'):
- output_dim = cell.state_size[0]
- else:
- output_dim = cell.state_size
- input_shape = (input_shape[0], input_shape[1], output_dim)
- self.built = True
+ All recurrent layers (`LSTM`, `GRU`, `SimpleRNN`) also
+ follow the specifications of this class and accept
+ the keyword arguments listed below.
- def get_config(self):
- cells = []
- for cell in self.cells:
- cells.append({
- 'class_name': cell.__class__.__name__,
- 'config': cell.get_config()
- })
- config = {'cells': cells}
- base_config = super(StackedRNNCells, self).get_config()
- return dict(list(base_config.items()) + list(config.items()))
+ Example:
- @classmethod
- def from_config(cls, config, custom_objects=None):
- from tensorflow.python.keras._impl.keras.layers import deserialize as deserialize_layer # pylint: disable=g-import-not-at-top
- cells = []
- for cell_config in config.pop('cells'):
- cells.append(
- deserialize_layer(cell_config, custom_objects=custom_objects))
- return cls(cells, **config)
-
- @property
- def trainable_weights(self):
- if not self.trainable:
- return []
- weights = []
- for cell in self.cells:
- if isinstance(cell, Layer):
- weights += cell.trainable_weights
- return weights
-
- @property
- def non_trainable_weights(self):
- weights = []
- for cell in self.cells:
- if isinstance(cell, Layer):
- weights += cell.non_trainable_weights
- if not self.trainable:
- trainable_weights = []
- for cell in self.cells:
- if isinstance(cell, Layer):
- trainable_weights += cell.trainable_weights
- return trainable_weights + weights
- return weights
-
- def get_weights(self):
- """Retrieves the weights of the model.
-
- Returns:
- A flat list of Numpy arrays.
- """
- weights = []
- for cell in self.cells:
- if isinstance(cell, Layer):
- weights += cell.weights
- return K.batch_get_value(weights)
-
- def set_weights(self, weights):
- """Sets the weights of the model.
-
- Arguments:
- weights: A list of Numpy arrays with shapes and types matching
- the output of `model.get_weights()`.
- """
- tuples = []
- for cell in self.cells:
- if isinstance(cell, Layer):
- num_param = len(cell.weights)
- weights = weights[:num_param]
- for sw, w in zip(cell.weights, weights):
- tuples.append((sw, w))
- weights = weights[num_param:]
- K.batch_set_value(tuples)
-
- @property
- def losses(self):
- losses = []
- for cell in self.cells:
- if isinstance(cell, Layer):
- cell_losses = cell.losses
- losses += cell_losses
- return losses
-
- def get_losses_for(self, inputs=None):
- losses = []
- for cell in self.cells:
- if isinstance(cell, Layer):
- cell_losses = cell.get_losses_for(inputs)
- losses += cell_losses
- return losses
-
-
-class RNN(Layer):
- """Base class for recurrent layers.
+ ```python
+ # as the first layer in a Sequential model
+ model = Sequential()
+ model.add(LSTM(32, input_shape=(10, 64)))
+ # now model.output_shape == (None, 32)
+ # note: `None` is the batch dimension.
+
+ # for subsequent layers, no need to specify the input size:
+ model.add(LSTM(16))
+
+ # to stack recurrent layers, you must use return_sequences=True
+ # on any recurrent layer that feeds into another recurrent layer.
+ # note that you only need to specify the input size on the first layer.
+ model = Sequential()
+ model.add(LSTM(64, input_dim=64, input_length=10, return_sequences=True))
+ model.add(LSTM(32, return_sequences=True))
+ model.add(LSTM(10))
+ ```
Arguments:
- cell: A RNN cell instance. A RNN cell is a class that has:
- - a `call(input_at_t, states_at_t)` method, returning
- `(output_at_t, states_at_t_plus_1)`. The call method of the
- cell can also take the optional argument `constants`, see
- section "Note on passing external constants" below.
- - a `state_size` attribute. This can be a single integer
- (single state) in which case it is
- the size of the recurrent state
- (which should be the same as the size of the cell output).
- This can also be a list/tuple of integers
- (one size per state). In this case, the first entry
- (`state_size[0]`) should be the same as
- the size of the cell output.
- It is also possible for `cell` to be a list of RNN cell instances,
- in which cases the cells get stacked on after the other in the RNN,
- implementing an efficient stacked RNN.
- return_sequences: Boolean. Whether to return the last output.
+ weights: list of Numpy arrays to set as initial weights.
+ The list should have 3 elements, of shapes:
+ `[(input_dim, output_dim), (output_dim, output_dim), (output_dim,)]`.
+ return_sequences: Boolean. Whether to return the last output
in the output sequence, or the full sequence.
return_state: Boolean. Whether to return the last state
in addition to the output.
@@ -247,9 +137,21 @@ class RNN(Layer):
Unrolling can speed-up a RNN,
although it tends to be more memory-intensive.
Unrolling is only suitable for short sequences.
+ implementation: one of {0, 1, or 2}.
+ If set to 0, the RNN will use
+ an implementation that uses fewer, larger matrix products,
+ thus running faster on CPU but consuming more memory.
+ If set to 1, the RNN will use more matrix products,
+ but smaller ones, thus running slower
+ (may actually be faster on GPU) while consuming less memory.
+ If set to 2 (LSTM/GRU only),
+ the RNN will combine the input gate,
+ the forget gate and the output gate into a single matrix,
+ enabling more time-efficient parallelization on the GPU.
+ Note: RNN dropout must be shared for all gates,
+ resulting in a slightly reduced regularization.
input_dim: dimensionality of the input (integer).
- This argument (or alternatively,
- the keyword argument `input_shape`)
+ This argument (or alternatively, the keyword argument `input_shape`)
is required when using this layer as the first layer in a model.
input_length: Length of input sequences, to be specified
when it is constant.
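A hedged usage sketch of the restored `implementation` modes described above (shapes hypothetical; mode 2 fuses the gate matrices, which mainly pays off on GPU):

```python
from tensorflow.python import keras

model = keras.models.Sequential()
# implementation=0: fewer, larger products; 1: more, smaller products;
# 2 (LSTM/GRU only): gates combined into a single matrix product.
model.add(keras.layers.LSTM(32, input_shape=(10, 8), implementation=2))
```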
@@ -261,7 +163,7 @@ class RNN(Layer):
at the level of the first layer
(e.g. via the `input_shape` argument)
- Input shape:
+ Input shape:
3D tensor with shape `(batch_size, timesteps, input_dim)`,
(Optional) 2D tensors with shape `(batch_size, output_dim)`.
@@ -276,7 +178,7 @@ class RNN(Layer):
# Masking
This layer supports masking for input data with a variable number
of timesteps. To introduce masks to your data,
- use an [Embedding](embeddings.md) layer with the `mask_zero` parameter
+ use an `Embedding` layer with the `mask_zero` parameter
set to `True`.
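The masking note above as a hedged sketch (vocabulary and sizes hypothetical):

```python
from tensorflow.python import keras

model = keras.models.Sequential()
# mask_zero=True reserves index 0 as padding; RNN layers downstream
# skip those timesteps.
model.add(keras.layers.Embedding(input_dim=1000, output_dim=64,
                                 mask_zero=True))
model.add(keras.layers.LSTM(32))
```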
# Note on using statefulness in RNNs
@@ -310,128 +212,42 @@ class RNN(Layer):
calling `reset_states` with the keyword argument `states`. The value of
`states` should be a numpy array or list of numpy arrays representing
the initial state of the RNN layer.
-
- # Note on passing external constants to RNNs
- You can pass "external" constants to the cell using the `constants`
- keyword argument of `RNN.__call__` (as well as `RNN.call`) method. This
- requires that the `cell.call` method accepts the same keyword argument
- `constants`. Such constants can be used to condition the cell
- transformation on additional static inputs (not changing over time),
- a.k.a. an attention mechanism.
-
- Examples:
-
- ```python
- # First, let's define a RNN Cell, as a layer subclass.
-
- class MinimalRNNCell(keras.layers.Layer):
-
- def __init__(self, units, **kwargs):
- self.units = units
- self.state_size = units
- super(MinimalRNNCell, self).__init__(**kwargs)
-
- def build(self, input_shape):
- self.kernel = self.add_weight(shape=(input_shape[-1], self.units),
- initializer='uniform',
- name='kernel')
- self.recurrent_kernel = self.add_weight(
- shape=(self.units, self.units),
- initializer='uniform',
- name='recurrent_kernel')
- self.built = True
-
- def call(self, inputs, states):
- prev_output = states[0]
- h = K.dot(inputs, self.kernel)
- output = h + K.dot(prev_output, self.recurrent_kernel)
- return output, [output]
-
- # Let's use this cell in a RNN layer:
-
- cell = MinimalRNNCell(32)
- x = keras.Input((None, 5))
- layer = RNN(cell)
- y = layer(x)
-
- # Here's how to use the cell to build a stacked RNN:
-
- cells = [MinimalRNNCell(32), MinimalRNNCell(64)]
- x = keras.Input((None, 5))
- layer = RNN(cells)
- y = layer(x)
- ```
"""
def __init__(self,
- cell,
return_sequences=False,
return_state=False,
go_backwards=False,
stateful=False,
unroll=False,
- activity_regularizer=None,
+ implementation=0,
**kwargs):
- if isinstance(cell, (list, tuple)):
- cell = StackedRNNCells(cell)
- if not hasattr(cell, 'call'):
- raise ValueError('`cell` should have a `call` method. '
- 'The RNN was passed:', cell)
- if not hasattr(cell, 'state_size'):
- raise ValueError('The RNN cell should have '
- 'an attribute `state_size` '
- '(tuple of integers, '
- 'one integer per RNN state).')
- super(RNN, self).__init__(
- activity_regularizer=regularizers.get(activity_regularizer), **kwargs)
- self.cell = cell
+ super(Recurrent, self).__init__(**kwargs)
self.return_sequences = return_sequences
self.return_state = return_state
self.go_backwards = go_backwards
self.stateful = stateful
self.unroll = unroll
-
+ self.implementation = implementation
self.supports_masking = True
self.input_spec = [InputSpec(ndim=3)]
self.state_spec = None
- self._states = None
- self.constants_spec = None
- self._num_constants = None
-
- @property
- def states(self):
- if self._states is None:
- if isinstance(self.cell.state_size, int):
- num_states = 1
- else:
- num_states = len(self.cell.state_size)
- return [None for _ in range(num_states)]
- return self._states
-
- @states.setter
- def states(self, states):
- self._states = states
+ self.dropout = 0
+ self.recurrent_dropout = 0
def _compute_output_shape(self, input_shape):
if isinstance(input_shape, list):
input_shape = input_shape[0]
input_shape = tensor_shape.TensorShape(input_shape).as_list()
-
- if hasattr(self.cell.state_size, '__len__'):
- output_dim = self.cell.state_size[0]
- else:
- output_dim = self.cell.state_size
-
if self.return_sequences:
- output_shape = (input_shape[0], input_shape[1], output_dim)
+ output_shape = (input_shape[0], input_shape[1], self.units)
else:
- output_shape = (input_shape[0], output_dim)
+ output_shape = (input_shape[0], self.units)
if self.return_state:
- state_shape = [(input_shape[0], output_dim) for _ in self.states]
- output_shape = [output_shape] + state_shape
- else:
- output_shape = output_shape
+ state_shape = [tensor_shape.TensorShape(
+ (input_shape[0], self.units)) for _ in self.states]
+ return [tensor_shape.TensorShape(output_shape)] + state_shape
return tensor_shape.TensorShape(output_shape)
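The resulting shapes of `_compute_output_shape` above, restated as a sketch with plain tuples (assuming input `(batch, timesteps, input_dim)`):

```python
def rnn_output_shapes(batch, timesteps, units, num_states,
                      return_sequences, return_state):
    out = (batch, timesteps, units) if return_sequences else (batch, units)
    if return_state:
        return [out] + [(batch, units)] * num_states
    return out

print(rnn_output_shapes(32, 10, 8, 2, True, True))
# [(32, 10, 8), (32, 8), (32, 8)]
```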
def compute_mask(self, inputs, mask):
@@ -441,123 +257,82 @@ class RNN(Layer):
if self.return_state:
state_mask = [None for _ in self.states]
return [output_mask] + state_mask
- else:
- return output_mask
-
- def build(self, input_shape):
- # Note input_shape will be list of shapes of initial states and
- # constants if these are passed in __call__.
- if self._num_constants is not None:
- constants_shape = input_shape[-self._num_constants:] # pylint: disable=invalid-unary-operand-type
- else:
- constants_shape = None
-
- if isinstance(input_shape, list):
- input_shape = input_shape[0]
- input_shape = tuple(tensor_shape.TensorShape(input_shape).as_list())
+ return output_mask
- batch_size = input_shape[0] if self.stateful else None
- input_dim = input_shape[-1]
- self.input_spec[0] = InputSpec(shape=(batch_size, None, input_dim))
-
- # allow cell (if layer) to build before we set or validate state_spec
- if isinstance(self.cell, Layer):
- step_input_shape = (input_shape[0],) + input_shape[2:]
- if constants_shape is not None:
- self.cell.build([step_input_shape] + constants_shape)
- else:
- self.cell.build(step_input_shape)
+ def step(self, inputs, states):
+ raise NotImplementedError
- # set or validate state_spec
- if hasattr(self.cell.state_size, '__len__'):
- state_size = list(self.cell.state_size)
- else:
- state_size = [self.cell.state_size]
-
- if self.state_spec is not None:
- # initial_state was passed in call, check compatibility
- if [spec.shape[-1] for spec in self.state_spec] != state_size:
- raise ValueError(
- 'An initial_state was passed that is not compatible with '
- '`cell.state_size`. Received `state_spec`={}; '
- 'However `cell.state_size` is '
- '{}'.format(self.state_spec, self.cell.state_size))
- else:
- self.state_spec = [InputSpec(shape=(None, dim)) for dim in state_size]
- if self.stateful:
- self.reset_states()
+ def get_constants(self, inputs, training=None):
+ return []
def get_initial_state(self, inputs):
# build an all-zero tensor of shape (samples, output_dim)
initial_state = K.zeros_like(inputs) # (samples, timesteps, input_dim)
initial_state = K.sum(initial_state, axis=(1, 2)) # (samples,)
initial_state = K.expand_dims(initial_state) # (samples, 1)
- if hasattr(self.cell.state_size, '__len__'):
- return [K.tile(initial_state, [1, dim]) for dim in self.cell.state_size]
- else:
- return [K.tile(initial_state, [1, self.cell.state_size])]
+ initial_state = K.tile(initial_state, [1,
+ self.units]) # (samples, output_dim)
+ initial_state = [initial_state for _ in range(len(self.states))]
+ return initial_state
- def __call__(self, inputs, initial_state=None, constants=None, **kwargs):
- inputs, initial_state, constants = self._standardize_args(
- inputs, initial_state, constants)
+ def preprocess_input(self, inputs, training=None):
+ return inputs
- if initial_state is None and constants is None:
- return super(RNN, self).__call__(inputs, **kwargs)
+ def __call__(self, inputs, initial_state=None, **kwargs):
+ if (isinstance(inputs, (list, tuple)) and
+ len(inputs) > 1
+ and initial_state is None):
+ initial_state = inputs[1:]
+ inputs = inputs[0]
- # If any of `initial_state` or `constants` are specified and are Keras
- # tensors, then add them to the inputs and temporarily modify the
- # input_spec to include them.
+ # If `initial_state` is specified,
+ # and if it a Keras tensor,
+ # then add it to the inputs and temporarily
+ # modify the input spec to include the state.
+ if initial_state is None:
+ return super(Recurrent, self).__call__(inputs, **kwargs)
- additional_inputs = []
- additional_specs = []
- if initial_state is not None:
- kwargs['initial_state'] = initial_state
- additional_inputs += initial_state
- self.state_spec = [
- InputSpec(shape=K.int_shape(state)) for state in initial_state
- ]
- additional_specs += self.state_spec
- if constants is not None:
- kwargs['constants'] = constants
- additional_inputs += constants
- self.constants_spec = [
- InputSpec(shape=K.int_shape(constant)) for constant in constants
- ]
- self._num_constants = len(constants)
- additional_specs += self.constants_spec
- # at this point additional_inputs cannot be empty
- is_keras_tensor = hasattr(additional_inputs[0], '_keras_history')
- for tensor in additional_inputs:
+ if not isinstance(initial_state, (list, tuple)):
+ initial_state = [initial_state]
+
+ is_keras_tensor = hasattr(initial_state[0], '_keras_history')
+ for tensor in initial_state:
if hasattr(tensor, '_keras_history') != is_keras_tensor:
- raise ValueError('The initial state or constants of an RNN'
- ' layer cannot be specified with a mix of'
- ' Keras tensors and non-Keras tensors')
+ raise ValueError('The initial state of an RNN layer cannot be'
+ ' specified with a mix of Keras tensors and'
+ ' non-Keras tensors')
if is_keras_tensor:
- # Compute the full input spec, including state and constants
- full_input = [inputs] + additional_inputs
- full_input_spec = self.input_spec + additional_specs
- # Perform the call with temporarily replaced input_spec
- original_input_spec = self.input_spec
- self.input_spec = full_input_spec
- output = super(RNN, self).__call__(full_input, **kwargs)
- self.input_spec = original_input_spec
+ # Compute the full input spec, including state
+ input_spec = self.input_spec
+ state_spec = self.state_spec
+ if not isinstance(input_spec, list):
+ input_spec = [input_spec]
+ if not isinstance(state_spec, list):
+ state_spec = [state_spec]
+ self.input_spec = input_spec + state_spec
+
+ # Compute the full inputs, including state
+ inputs = [inputs] + list(initial_state)
+
+ # Perform the call
+ output = super(Recurrent, self).__call__(inputs, **kwargs)
+
+ # Restore original input spec
+ self.input_spec = input_spec
return output
else:
- return super(RNN, self).__call__(inputs, **kwargs)
-
- def call(self,
- inputs,
- mask=None,
- training=None,
- initial_state=None,
- constants=None):
+ kwargs['initial_state'] = initial_state
+ return super(Recurrent, self).__call__(inputs, **kwargs)
+
+ def call(self, inputs, mask=None, training=None, initial_state=None):
# input shape: `(samples, time (padded with zeros), input_dim)`
# note that the .build() method of subclasses MUST define
# self.input_spec and self.state_spec with complete input shapes.
if isinstance(inputs, list):
+ initial_state = inputs[1:]
inputs = inputs[0]
- if initial_state is not None:
+ elif initial_state is not None:
pass
elif self.stateful:
initial_state = self.states
@@ -568,14 +343,13 @@ class RNN(Layer):
mask = mask[0]
if len(initial_state) != len(self.states):
- raise ValueError(
- 'Layer has ' + str(len(self.states)) + ' states but was passed ' +
- str(len(initial_state)) + ' initial states.')
+ raise ValueError('Layer has ' + str(len(self.states)) +
+ ' states but was passed ' + str(len(initial_state)) +
+ ' initial states.')
input_shape = K.int_shape(inputs)
- timesteps = input_shape[1]
- if self.unroll and timesteps in [None, 1]:
+ if self.unroll and input_shape[1] is None:
raise ValueError('Cannot unroll a RNN if the '
- 'time dimension is undefined or equal to 1. \n'
+ 'time dimension is undefined. \n'
'- If using a Sequential model, '
'specify the time dimension by passing '
'an `input_shape` or `batch_input_shape` '
@@ -585,31 +359,15 @@ class RNN(Layer):
'- If using the functional API, specify '
'the time dimension by passing a `shape` '
'or `batch_shape` argument to your Input layer.')
-
- kwargs = {}
- if has_arg(self.cell.call, 'training'):
- kwargs['training'] = training
-
- if constants:
- if not has_arg(self.cell.call, 'constants'):
- raise ValueError('RNN cell does not support constants')
-
- def step(inputs, states):
- constants = states[-self._num_constants:] # pylint: disable=invalid-unary-operand-type
- states = states[:-self._num_constants] # pylint: disable=invalid-unary-operand-type
- return self.cell.call(inputs, states, constants=constants, **kwargs)
- else:
-
- def step(inputs, states):
- return self.cell.call(inputs, states, **kwargs)
-
+ constants = self.get_constants(inputs, training=None)
+ preprocessed_input = self.preprocess_input(inputs, training=None)
last_output, outputs, states = K.rnn(
- step,
- inputs,
+ self.step,
+ preprocessed_input,
initial_state,
- constants=constants,
go_backwards=self.go_backwards,
mask=mask,
+ constants=constants,
unroll=self.unroll)
if self.stateful:
updates = []
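For orientation, the call path above is `preprocess_input`, then `K.rnn(self.step, ...)`, with `get_constants` supplying per-step dropout masks. A hedged restatement of the `K.rnn` contract as used here:

```python
# step(inputs_t, states_t) -> (output_t, states_t_plus_1); the entries
# in `constants` are appended to the states passed to every step call.
last_output, outputs, states = K.rnn(
    self.step,            # per-timestep function
    preprocessed_input,   # (batch, timesteps, features)
    initial_state,        # list of (batch, units) tensors
    go_backwards=self.go_backwards,
    mask=mask,
    constants=constants,  # e.g. dropout masks from get_constants()
    unroll=self.unroll)
```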
@@ -617,63 +375,21 @@ class RNN(Layer):
updates.append((self.states[i], states[i]))
self.add_update(updates, inputs)
- if self.return_sequences:
- output = outputs
- else:
- output = last_output
-
# Properly set learning phase
- if getattr(last_output, '_uses_learning_phase', False):
- output._uses_learning_phase = True
+ if 0 < self.dropout + self.recurrent_dropout:
+ last_output._uses_learning_phase = True
+ outputs._uses_learning_phase = True
+
+ if not self.return_sequences:
+ outputs = last_output
if self.return_state:
if not isinstance(states, (list, tuple)):
states = [states]
else:
states = list(states)
- return [output] + states
- else:
- return output
-
- def _standardize_args(self, inputs, initial_state, constants):
- """Standardize `__call__` arguments to a single list of tensor inputs.
-
- When running a model loaded from file, the input tensors
- `initial_state` and `constants` can be passed to `RNN.__call__` as part
- of `inputs` instead of by the dedicated keyword arguments. This method
- makes sure the arguments are separated and that `initial_state` and
- `constants` are lists of tensors (or None).
-
- Arguments:
- inputs: tensor or list/tuple of tensors
- initial_state: tensor or list of tensors or None
- constants: tensor or list of tensors or None
-
- Returns:
- inputs: tensor
- initial_state: list of tensors or None
- constants: list of tensors or None
- """
- if isinstance(inputs, list):
- assert initial_state is None and constants is None
- if self._num_constants is not None:
- constants = inputs[-self._num_constants:] # pylint: disable=invalid-unary-operand-type
- inputs = inputs[:-self._num_constants] # pylint: disable=invalid-unary-operand-type
- if len(inputs) > 1:
- initial_state = inputs[1:]
- inputs = inputs[0]
-
- def to_list_or_none(x):
- if x is None or isinstance(x, list):
- return x
- if isinstance(x, tuple):
- return list(x)
- return [x]
-
- initial_state = to_list_or_none(initial_state)
- constants = to_list_or_none(constants)
-
- return inputs, initial_state, constants
+ return [outputs] + states
+ return outputs
def reset_states(self, states=None):
if not self.stateful:
@@ -692,19 +408,10 @@ class RNN(Layer):
'`batch_shape` argument to your Input layer.')
# initialize state if None
if self.states[0] is None:
- if hasattr(self.cell.state_size, '__len__'):
- self.states = [
- K.zeros((batch_size, dim)) for dim in self.cell.state_size
- ]
- else:
- self.states = [K.zeros((batch_size, self.cell.state_size))]
+ self.states = [K.zeros((batch_size, self.units)) for _ in self.states]
elif states is None:
- if hasattr(self.cell.state_size, '__len__'):
- for state, dim in zip(self.states, self.cell.state_size):
- K.set_value(state, np.zeros((batch_size, dim)))
- else:
- K.set_value(self.states[0], np.zeros((batch_size,
- self.cell.state_size)))
+ for state in self.states:
+ K.set_value(state, np.zeros((batch_size, self.units)))
else:
if not isinstance(states, (list, tuple)):
states = [states]
@@ -714,16 +421,11 @@ class RNN(Layer):
'but it received ' + str(len(states)) +
' state values. Input received: ' + str(states))
for index, (value, state) in enumerate(zip(states, self.states)):
- if hasattr(self.cell.state_size, '__len__'):
- dim = self.cell.state_size[index]
- else:
- dim = self.cell.state_size
- if value.shape != (batch_size, dim):
- raise ValueError(
- 'State ' + str(index) + ' is incompatible with layer ' +
- self.name + ': expected shape=' + str(
- (batch_size, dim)) + ', found shape=' + str(value.shape))
- # TODO(fchollet): consider batch calls to `set_value`.
+ if value.shape != (batch_size, self.units):
+ raise ValueError('State ' + str(index) +
+ ' is incompatible with layer ' + self.name +
+ ': expected shape=' + str((batch_size, self.units)) +
+ ', found shape=' + str(value.shape))
K.set_value(state, value)
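Usage sketch for `reset_states` above (stateful layer; `batch_size` must match the layer's `batch_input_shape`, `units` is the layer width):

```python
import numpy as np

batch_size = 32  # hypothetical; fixed by the layer's batch_input_shape

layer.reset_states()  # zero every state
# Or set states explicitly; each array must be (batch_size, units):
layer.reset_states(np.zeros((batch_size, layer.units)))
```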
def get_config(self):
@@ -732,94 +434,51 @@ class RNN(Layer):
'return_state': self.return_state,
'go_backwards': self.go_backwards,
'stateful': self.stateful,
- 'unroll': self.unroll
- }
- if self._num_constants is not None:
- config['num_constants'] = self._num_constants
-
- cell_config = self.cell.get_config()
- config['cell'] = {
- 'class_name': self.cell.__class__.__name__,
- 'config': cell_config
+ 'unroll': self.unroll,
+ 'implementation': self.implementation
}
- base_config = super(RNN, self).get_config()
+ base_config = super(Recurrent, self).get_config()
return dict(list(base_config.items()) + list(config.items()))
- @classmethod
- def from_config(cls, config, custom_objects=None):
- from tensorflow.python.keras._impl.keras.layers import deserialize as deserialize_layer # pylint: disable=g-import-not-at-top
- cell = deserialize_layer(config.pop('cell'), custom_objects=custom_objects)
- num_constants = config.pop('num_constants', None)
- layer = cls(cell, **config)
- layer._num_constants = num_constants
- return layer
-
- @property
- def trainable_weights(self):
- if isinstance(self.cell, Layer):
- return self.cell.trainable_weights
- return []
-
- @property
- def non_trainable_weights(self):
- if isinstance(self.cell, Layer):
- return self.cell.non_trainable_weights
- return []
- @property
- def losses(self):
- if isinstance(self.cell, Layer):
- return self.cell.losses
- return []
-
- def get_losses_for(self, inputs=None):
- if isinstance(self.cell, Layer):
- cell_losses = self.cell.get_losses_for(inputs)
- return cell_losses + super(RNN, self).get_losses_for(inputs)
- return super(RNN, self).get_losses_for(inputs)
-
-
-class SimpleRNNCell(Layer):
- """Cell class for SimpleRNN.
+class SimpleRNN(Recurrent):
+ """Fully-connected RNN where the output is to be fed back to input.
Arguments:
units: Positive integer, dimensionality of the output space.
- activation: Activation function to use
- (see [activations](../activations.md)).
+ activation: Activation function to use.
+ Default: hyperbolic tangent (`tanh`).
If you pass None, no activation is applied
(ie. "linear" activation: `a(x) = x`).
use_bias: Boolean, whether the layer uses a bias vector.
kernel_initializer: Initializer for the `kernel` weights matrix,
- used for the linear transformation of the inputs.
- (see [initializers](../initializers.md)).
+ used for the linear transformation of the inputs.
recurrent_initializer: Initializer for the `recurrent_kernel`
weights matrix,
- used for the linear transformation of the recurrent state.
- (see [initializers](../initializers.md)).
- bias_initializer: Initializer for the bias vector
- (see [initializers](../initializers.md)).
+ used for the linear transformation of the recurrent state.
+ bias_initializer: Initializer for the bias vector.
kernel_regularizer: Regularizer function applied to
- the `kernel` weights matrix
- (see [regularizer](../regularizers.md)).
+ the `kernel` weights matrix.
recurrent_regularizer: Regularizer function applied to
- the `recurrent_kernel` weights matrix
- (see [regularizer](../regularizers.md)).
- bias_regularizer: Regularizer function applied to the bias vector
- (see [regularizer](../regularizers.md)).
+ the `recurrent_kernel` weights matrix.
+ bias_regularizer: Regularizer function applied to the bias vector.
+ activity_regularizer: Regularizer function applied to
+ the output of the layer (its "activation").
kernel_constraint: Constraint function applied to
- the `kernel` weights matrix
- (see [constraints](../constraints.md)).
+ the `kernel` weights matrix.
recurrent_constraint: Constraint function applied to
- the `recurrent_kernel` weights matrix
- (see [constraints](../constraints.md)).
- bias_constraint: Constraint function applied to the bias vector
- (see [constraints](../constraints.md)).
+ the `recurrent_kernel` weights matrix.
+ bias_constraint: Constraint function applied to the bias vector.
dropout: Float between 0 and 1.
Fraction of the units to drop for
the linear transformation of the inputs.
recurrent_dropout: Float between 0 and 1.
Fraction of the units to drop for
the linear transformation of the recurrent state.
+
+ References:
+ - [A Theoretically Grounded Application of Dropout in Recurrent Neural
+ Networks](http://arxiv.org/abs/1512.05287)
"""
def __init__(self,
@@ -832,13 +491,15 @@ class SimpleRNNCell(Layer):
kernel_regularizer=None,
recurrent_regularizer=None,
bias_regularizer=None,
+ activity_regularizer=None,
kernel_constraint=None,
recurrent_constraint=None,
bias_constraint=None,
dropout=0.,
recurrent_dropout=0.,
**kwargs):
- super(SimpleRNNCell, self).__init__(**kwargs)
+ super(SimpleRNN, self).__init__(
+ activity_regularizer=regularizers.get(activity_regularizer), **kwargs)
self.units = units
self.activation = activations.get(activation)
self.use_bias = use_bias
@@ -857,13 +518,23 @@ class SimpleRNNCell(Layer):
self.dropout = min(1., max(0., dropout))
self.recurrent_dropout = min(1., max(0., recurrent_dropout))
- self.state_size = self.units
- self._dropout_mask = None
- self._recurrent_dropout_mask = None
+ self.state_spec = InputSpec(shape=(None, self.units))
def build(self, input_shape):
+ if isinstance(input_shape, list):
+ input_shape = input_shape[0]
+ input_shape = tensor_shape.TensorShape(input_shape).as_list()
+
+ batch_size = input_shape[0] if self.stateful else None
+ self.input_dim = input_shape[2]
+ self.input_spec[0] = InputSpec(shape=(batch_size, None, self.input_dim))
+
+ self.states = [None]
+ if self.stateful:
+ self.reset_states()
+
self.kernel = self.add_weight(
- shape=(input_shape[-1], self.units),
+ shape=(self.input_dim, self.units),
name='kernel',
initializer=self.kernel_initializer,
regularizer=self.kernel_regularizer,
@@ -885,327 +556,146 @@ class SimpleRNNCell(Layer):
self.bias = None
self.built = True
- def _generate_dropout_mask(self, inputs, training=None):
- if 0 < self.dropout < 1:
- ones = K.ones_like(K.squeeze(inputs[:, 0:1, :], axis=1))
-
- def dropped_inputs():
- return K.dropout(ones, self.dropout)
-
- self._dropout_mask = K.in_train_phase(
- dropped_inputs, ones, training=training)
+ def preprocess_input(self, inputs, training=None):
+ if self.implementation > 0:
+ return inputs
else:
- self._dropout_mask = None
+ input_shape = inputs.get_shape().as_list()
+ input_dim = input_shape[2]
+ timesteps = input_shape[1]
+ return _time_distributed_dense(
+ inputs,
+ self.kernel,
+ self.bias,
+ self.dropout,
+ input_dim,
+ self.units,
+ timesteps,
+ training=training)
- def _generate_recurrent_dropout_mask(self, inputs, training=None):
- if 0 < self.recurrent_dropout < 1:
- ones = K.ones_like(K.reshape(inputs[:, 0, 0], (-1, 1)))
- ones = K.tile(ones, (1, self.units))
-
- def dropped_inputs():
- return K.dropout(ones, self.dropout)
-
- self._recurrent_dropout_mask = K.in_train_phase(
- dropped_inputs, ones, training=training)
+ def step(self, inputs, states):
+ if self.implementation == 0:
+ h = inputs
else:
- self._recurrent_dropout_mask = None
+ if 0 < self.dropout < 1:
+ h = K.dot(inputs * states[1], self.kernel)
+ else:
+ h = K.dot(inputs, self.kernel)
+ if self.bias is not None:
+ h = K.bias_add(h, self.bias)
- def call(self, inputs, states, training=None):
prev_output = states[0]
- dp_mask = self._dropout_mask
- rec_dp_mask = self._recurrent_dropout_mask
-
- if dp_mask is not None:
- h = K.dot(inputs * dp_mask, self.kernel)
- else:
- h = K.dot(inputs, self.kernel)
- if self.bias is not None:
- h = K.bias_add(h, self.bias)
-
- if rec_dp_mask is not None:
- prev_output *= rec_dp_mask
+ if 0 < self.recurrent_dropout < 1:
+ prev_output *= states[2]
output = h + K.dot(prev_output, self.recurrent_kernel)
if self.activation is not None:
output = self.activation(output)
# Properly set learning phase on output tensor.
if 0 < self.dropout + self.recurrent_dropout:
- if training is None:
- output._uses_learning_phase = True
+ output._uses_learning_phase = True
return output, [output]
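The step above is the plain Elman recurrence; in NumPy terms (dropout masks omitted, names hypothetical):

```python
import numpy as np

def simple_rnn_step(x_t, h_prev, kernel, recurrent_kernel, bias,
                    activation=np.tanh):
    # h = W x_t + b; output = act(h + U h_prev), as in step() above.
    h = x_t @ kernel + bias
    output = activation(h + h_prev @ recurrent_kernel)
    return output, [output]
```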
+ def get_constants(self, inputs, training=None):
+ constants = []
+ if self.implementation != 0 and 0 < self.dropout < 1:
+ input_shape = K.int_shape(inputs)
+ input_dim = input_shape[-1]
+ ones = K.ones_like(K.reshape(inputs[:, 0, 0], (-1, 1)))
+ ones = K.tile(ones, (1, int(input_dim)))
-class SimpleRNN(RNN):
- """Fully-connected RNN where the output is to be fed back to input.
-
- Arguments:
- units: Positive integer, dimensionality of the output space.
- activation: Activation function to use
- (see [activations](../activations.md)).
- If you pass None, no activation is applied
- (ie. "linear" activation: `a(x) = x`).
- use_bias: Boolean, whether the layer uses a bias vector.
- kernel_initializer: Initializer for the `kernel` weights matrix,
- used for the linear transformation of the inputs.
- (see [initializers](../initializers.md)).
- recurrent_initializer: Initializer for the `recurrent_kernel`
- weights matrix,
- used for the linear transformation of the recurrent state.
- (see [initializers](../initializers.md)).
- bias_initializer: Initializer for the bias vector
- (see [initializers](../initializers.md)).
- kernel_regularizer: Regularizer function applied to
- the `kernel` weights matrix
- (see [regularizer](../regularizers.md)).
- recurrent_regularizer: Regularizer function applied to
- the `recurrent_kernel` weights matrix
- (see [regularizer](../regularizers.md)).
- bias_regularizer: Regularizer function applied to the bias vector
- (see [regularizer](../regularizers.md)).
- activity_regularizer: Regularizer function applied to
- the output of the layer (its "activation").
- (see [regularizer](../regularizers.md)).
- kernel_constraint: Constraint function applied to
- the `kernel` weights matrix
- (see [constraints](../constraints.md)).
- recurrent_constraint: Constraint function applied to
- the `recurrent_kernel` weights matrix
- (see [constraints](../constraints.md)).
- bias_constraint: Constraint function applied to the bias vector
- (see [constraints](../constraints.md)).
- dropout: Float between 0 and 1.
- Fraction of the units to drop for
- the linear transformation of the inputs.
- recurrent_dropout: Float between 0 and 1.
- Fraction of the units to drop for
- the linear transformation of the recurrent state.
- return_sequences: Boolean. Whether to return the last output.
- in the output sequence, or the full sequence.
- return_state: Boolean. Whether to return the last state
- in addition to the output.
- go_backwards: Boolean (default False).
- If True, process the input sequence backwards and return the
- reversed sequence.
- stateful: Boolean (default False). If True, the last state
- for each sample at index i in a batch will be used as initial
- state for the sample of index i in the following batch.
- unroll: Boolean (default False).
- If True, the network will be unrolled,
- else a symbolic loop will be used.
- Unrolling can speed-up a RNN,
- although it tends to be more memory-intensive.
- Unrolling is only suitable for short sequences.
- """
-
- def __init__(self,
- units,
- activation='tanh',
- use_bias=True,
- kernel_initializer='glorot_uniform',
- recurrent_initializer='orthogonal',
- bias_initializer='zeros',
- kernel_regularizer=None,
- recurrent_regularizer=None,
- bias_regularizer=None,
- activity_regularizer=None,
- kernel_constraint=None,
- recurrent_constraint=None,
- bias_constraint=None,
- dropout=0.,
- recurrent_dropout=0.,
- return_sequences=False,
- return_state=False,
- go_backwards=False,
- stateful=False,
- unroll=False,
- **kwargs):
- if 'implementation' in kwargs:
- kwargs.pop('implementation')
- logging.warning('The `implementation` argument '
- 'in `SimpleRNN` has been deprecated. '
- 'Please remove it from your layer call.')
- cell = SimpleRNNCell(
- units,
- activation=activation,
- use_bias=use_bias,
- kernel_initializer=kernel_initializer,
- recurrent_initializer=recurrent_initializer,
- bias_initializer=bias_initializer,
- kernel_regularizer=kernel_regularizer,
- recurrent_regularizer=recurrent_regularizer,
- bias_regularizer=bias_regularizer,
- kernel_constraint=kernel_constraint,
- recurrent_constraint=recurrent_constraint,
- bias_constraint=bias_constraint,
- dropout=dropout,
- recurrent_dropout=recurrent_dropout)
- super(SimpleRNN, self).__init__(
- cell,
- return_sequences=return_sequences,
- return_state=return_state,
- go_backwards=go_backwards,
- stateful=stateful,
- unroll=unroll,
- activity_regularizer=regularizers.get(activity_regularizer),
- **kwargs)
- # self.activity_regularizer = regularizers.get(activity_regularizer)
-
- def call(self, inputs, mask=None, training=None, initial_state=None):
- self.cell._generate_dropout_mask(inputs, training=training)
- self.cell._generate_recurrent_dropout_mask(inputs, training=training)
- return super(SimpleRNN, self).call(
- inputs, mask=mask, training=training, initial_state=initial_state)
-
- @property
- def units(self):
- return self.cell.units
-
- @property
- def activation(self):
- return self.cell.activation
-
- @property
- def use_bias(self):
- return self.cell.use_bias
-
- @property
- def kernel_initializer(self):
- return self.cell.kernel_initializer
-
- @property
- def recurrent_initializer(self):
- return self.cell.recurrent_initializer
-
- @property
- def bias_initializer(self):
- return self.cell.bias_initializer
-
- @property
- def kernel_regularizer(self):
- return self.cell.kernel_regularizer
-
- @property
- def recurrent_regularizer(self):
- return self.cell.recurrent_regularizer
-
- @property
- def bias_regularizer(self):
- return self.cell.bias_regularizer
-
- @property
- def kernel_constraint(self):
- return self.cell.kernel_constraint
+ def dropped_inputs():
+ return K.dropout(ones, self.dropout)
- @property
- def recurrent_constraint(self):
- return self.cell.recurrent_constraint
+ dp_mask = K.in_train_phase(dropped_inputs, ones, training=training)
+ constants.append(dp_mask)
+ else:
+ constants.append(K.cast_to_floatx(1.))
- @property
- def bias_constraint(self):
- return self.cell.bias_constraint
+ if 0 < self.recurrent_dropout < 1:
+ ones = K.ones_like(K.reshape(inputs[:, 0, 0], (-1, 1)))
+ ones = K.tile(ones, (1, self.units))
- @property
- def dropout(self):
- return self.cell.dropout
+ def dropped_inputs(): # pylint: disable=function-redefined
+ return K.dropout(ones, self.recurrent_dropout)
- @property
- def recurrent_dropout(self):
- return self.cell.recurrent_dropout
+ rec_dp_mask = K.in_train_phase(dropped_inputs, ones, training=training)
+ constants.append(rec_dp_mask)
+ else:
+ constants.append(K.cast_to_floatx(1.))
+ return constants
def get_config(self):
config = {
- 'units':
- self.units,
- 'activation':
- activations.serialize(self.activation),
- 'use_bias':
- self.use_bias,
- 'kernel_initializer':
- initializers.serialize(self.kernel_initializer),
+ 'units': self.units,
+ 'activation': activations.serialize(self.activation),
+ 'use_bias': self.use_bias,
+ 'kernel_initializer': initializers.serialize(self.kernel_initializer),
'recurrent_initializer':
initializers.serialize(self.recurrent_initializer),
- 'bias_initializer':
- initializers.serialize(self.bias_initializer),
- 'kernel_regularizer':
- regularizers.serialize(self.kernel_regularizer),
+ 'bias_initializer': initializers.serialize(self.bias_initializer),
+ 'kernel_regularizer': regularizers.serialize(self.kernel_regularizer),
'recurrent_regularizer':
regularizers.serialize(self.recurrent_regularizer),
- 'bias_regularizer':
- regularizers.serialize(self.bias_regularizer),
+ 'bias_regularizer': regularizers.serialize(self.bias_regularizer),
'activity_regularizer':
regularizers.serialize(self.activity_regularizer),
- 'kernel_constraint':
- constraints.serialize(self.kernel_constraint),
+ 'kernel_constraint': constraints.serialize(self.kernel_constraint),
'recurrent_constraint':
constraints.serialize(self.recurrent_constraint),
- 'bias_constraint':
- constraints.serialize(self.bias_constraint),
- 'dropout':
- self.dropout,
- 'recurrent_dropout':
- self.recurrent_dropout
+ 'bias_constraint': constraints.serialize(self.bias_constraint),
+ 'dropout': self.dropout,
+ 'recurrent_dropout': self.recurrent_dropout
}
base_config = super(SimpleRNN, self).get_config()
- del base_config['cell']
return dict(list(base_config.items()) + list(config.items()))
- @classmethod
- def from_config(cls, config):
- if 'implementation' in config:
- config.pop('implementation')
- return cls(**config)
+class GRU(Recurrent):
+ """Gated Recurrent Unit - Cho et al.
-class GRUCell(Layer):
- """Cell class for the GRU layer.
+ 2014.
Arguments:
units: Positive integer, dimensionality of the output space.
- activation: Activation function to use
- (see [activations](../activations.md)).
+ activation: Activation function to use.
If you pass None, no activation is applied
(ie. "linear" activation: `a(x) = x`).
recurrent_activation: Activation function to use
- for the recurrent step
- (see [activations](../activations.md)).
+ for the recurrent step.
use_bias: Boolean, whether the layer uses a bias vector.
kernel_initializer: Initializer for the `kernel` weights matrix,
- used for the linear transformation of the inputs.
- (see [initializers](../initializers.md)).
+ used for the linear transformation of the inputs.
recurrent_initializer: Initializer for the `recurrent_kernel`
weights matrix,
- used for the linear transformation of the recurrent state.
- (see [initializers](../initializers.md)).
- bias_initializer: Initializer for the bias vector
- (see [initializers](../initializers.md)).
+ used for the linear transformation of the recurrent state.
+ bias_initializer: Initializer for the bias vector.
kernel_regularizer: Regularizer function applied to
- the `kernel` weights matrix
- (see [regularizer](../regularizers.md)).
+ the `kernel` weights matrix.
recurrent_regularizer: Regularizer function applied to
- the `recurrent_kernel` weights matrix
- (see [regularizer](../regularizers.md)).
- bias_regularizer: Regularizer function applied to the bias vector
- (see [regularizer](../regularizers.md)).
+ the `recurrent_kernel` weights matrix.
+ bias_regularizer: Regularizer function applied to the bias vector.
+ activity_regularizer: Regularizer function applied to
+ the output of the layer (its "activation").
kernel_constraint: Constraint function applied to
- the `kernel` weights matrix
- (see [constraints](../constraints.md)).
+ the `kernel` weights matrix.
recurrent_constraint: Constraint function applied to
- the `recurrent_kernel` weights matrix
- (see [constraints](../constraints.md)).
- bias_constraint: Constraint function applied to the bias vector
- (see [constraints](../constraints.md)).
+ the `recurrent_kernel` weights matrix.
+ bias_constraint: Constraint function applied to the bias vector.
dropout: Float between 0 and 1.
Fraction of the units to drop for
the linear transformation of the inputs.
recurrent_dropout: Float between 0 and 1.
Fraction of the units to drop for
the linear transformation of the recurrent state.
- implementation: Implementation mode, either 1 or 2.
- Mode 1 will structure its operations as a larger number of
- smaller dot products and additions, whereas mode 2 will
- batch them into fewer, larger operations. These modes will
- have different performance profiles on different hardware and
- for different applications.
+
+ References:
+ - [On the Properties of Neural Machine Translation: Encoder-Decoder
+ Approaches](https://arxiv.org/abs/1409.1259)
+ - [Empirical Evaluation of Gated Recurrent Neural Networks on Sequence
+ Modeling](http://arxiv.org/abs/1412.3555v1)
+ - [A Theoretically Grounded Application of Dropout in Recurrent Neural
+ Networks](http://arxiv.org/abs/1512.05287)
"""
def __init__(self,
@@ -1219,14 +709,15 @@ class GRUCell(Layer):
kernel_regularizer=None,
recurrent_regularizer=None,
bias_regularizer=None,
+ activity_regularizer=None,
kernel_constraint=None,
recurrent_constraint=None,
bias_constraint=None,
dropout=0.,
recurrent_dropout=0.,
- implementation=1,
**kwargs):
- super(GRUCell, self).__init__(**kwargs)
+ super(GRU, self).__init__(
+ activity_regularizer=regularizers.get(activity_regularizer), **kwargs)
self.units = units
self.activation = activations.get(activation)
self.recurrent_activation = activations.get(recurrent_activation)
@@ -1246,15 +737,22 @@ class GRUCell(Layer):
self.dropout = min(1., max(0., dropout))
self.recurrent_dropout = min(1., max(0., recurrent_dropout))
- self.implementation = implementation
- self.state_size = self.units
- self._dropout_mask = None
- self._recurrent_dropout_mask = None
+ self.state_spec = InputSpec(shape=(None, self.units))
def build(self, input_shape):
- input_dim = input_shape[-1]
+ if isinstance(input_shape, list):
+ input_shape = input_shape[0]
+ input_shape = tensor_shape.TensorShape(input_shape).as_list()
+ batch_size = input_shape[0] if self.stateful else None
+ self.input_dim = input_shape[2]
+ self.input_spec[0] = InputSpec(shape=(batch_size, None, self.input_dim))
+
+ self.states = [None]
+ if self.stateful:
+ self.reset_states()
+
self.kernel = self.add_weight(
- shape=(input_dim, self.units * 3),
+ shape=(self.input_dim, self.units * 3),
name='kernel',
initializer=self.kernel_initializer,
regularizer=self.kernel_regularizer,
@@ -1294,83 +792,89 @@ class GRUCell(Layer):
self.bias_h = None
self.built = True
- def _generate_dropout_mask(self, inputs, training=None):
- if 0 < self.dropout < 1:
- ones = K.ones_like(K.squeeze(inputs[:, 0:1, :], axis=1))
+ def preprocess_input(self, inputs, training=None):
+ if self.implementation == 0:
+ input_shape = inputs.get_shape().as_list()
+ input_dim = input_shape[2]
+ timesteps = input_shape[1]
+
+ x_z = _time_distributed_dense(
+ inputs,
+ self.kernel_z,
+ self.bias_z,
+ self.dropout,
+ input_dim,
+ self.units,
+ timesteps,
+ training=training)
+ x_r = _time_distributed_dense(
+ inputs,
+ self.kernel_r,
+ self.bias_r,
+ self.dropout,
+ input_dim,
+ self.units,
+ timesteps,
+ training=training)
+ x_h = _time_distributed_dense(
+ inputs,
+ self.kernel_h,
+ self.bias_h,
+ self.dropout,
+ input_dim,
+ self.units,
+ timesteps,
+ training=training)
+ return K.concatenate([x_z, x_r, x_h], axis=2)
+ else:
+ return inputs
+
+ def get_constants(self, inputs, training=None):
+ constants = []
+ if self.implementation != 0 and 0 < self.dropout < 1:
+ input_shape = K.int_shape(inputs)
+ input_dim = input_shape[-1]
+ ones = K.ones_like(K.reshape(inputs[:, 0, 0], (-1, 1)))
+ ones = K.tile(ones, (1, int(input_dim)))
def dropped_inputs():
return K.dropout(ones, self.dropout)
- self._dropout_mask = [
+ dp_mask = [
K.in_train_phase(dropped_inputs, ones, training=training)
for _ in range(3)
]
+ constants.append(dp_mask)
else:
- self._dropout_mask = None
+ constants.append([K.cast_to_floatx(1.) for _ in range(3)])
- def _generate_recurrent_dropout_mask(self, inputs, training=None):
if 0 < self.recurrent_dropout < 1:
ones = K.ones_like(K.reshape(inputs[:, 0, 0], (-1, 1)))
ones = K.tile(ones, (1, self.units))
- def dropped_inputs():
- return K.dropout(ones, self.dropout)
+ def dropped_inputs(): # pylint: disable=function-redefined
+ return K.dropout(ones, self.recurrent_dropout)
- self._recurrent_dropout_mask = [
+ rec_dp_mask = [
K.in_train_phase(dropped_inputs, ones, training=training)
for _ in range(3)
]
+ constants.append(rec_dp_mask)
else:
- self._recurrent_dropout_mask = None
+ constants.append([K.cast_to_floatx(1.) for _ in range(3)])
+ return constants
- def call(self, inputs, states, training=None):
+ def step(self, inputs, states):
h_tm1 = states[0] # previous memory
+ dp_mask = states[1] # dropout matrices for recurrent units
+ rec_dp_mask = states[2]
- # dropout matrices for input units
- dp_mask = self._dropout_mask
- # dropout matrices for recurrent units
- rec_dp_mask = self._recurrent_dropout_mask
-
- if self.implementation == 1:
- if 0. < self.dropout < 1.:
- inputs_z = inputs * dp_mask[0]
- inputs_r = inputs * dp_mask[1]
- inputs_h = inputs * dp_mask[2]
- else:
- inputs_z = inputs
- inputs_r = inputs
- inputs_h = inputs
- x_z = K.dot(inputs_z, self.kernel_z)
- x_r = K.dot(inputs_r, self.kernel_r)
- x_h = K.dot(inputs_h, self.kernel_h)
- if self.use_bias:
- x_z = K.bias_add(x_z, self.bias_z)
- x_r = K.bias_add(x_r, self.bias_r)
- x_h = K.bias_add(x_h, self.bias_h)
-
- if 0. < self.recurrent_dropout < 1.:
- h_tm1_z = h_tm1 * rec_dp_mask[0]
- h_tm1_r = h_tm1 * rec_dp_mask[1]
- h_tm1_h = h_tm1 * rec_dp_mask[2]
- else:
- h_tm1_z = h_tm1
- h_tm1_r = h_tm1
- h_tm1_h = h_tm1
- z = self.recurrent_activation(
- x_z + K.dot(h_tm1_z, self.recurrent_kernel_z))
- r = self.recurrent_activation(
- x_r + K.dot(h_tm1_r, self.recurrent_kernel_r))
-
- hh = self.activation(x_h + K.dot(r * h_tm1_h, self.recurrent_kernel_h))
- else:
- if 0. < self.dropout < 1.:
- inputs *= dp_mask[0]
- matrix_x = K.dot(inputs, self.kernel)
+ if self.implementation == 2:
+ matrix_x = K.dot(inputs * dp_mask[0], self.kernel)
if self.use_bias:
matrix_x = K.bias_add(matrix_x, self.bias)
- if 0. < self.recurrent_dropout < 1.:
- h_tm1 *= rec_dp_mask[0]
- matrix_inner = K.dot(h_tm1, self.recurrent_kernel[:, :2 * self.units])
+ matrix_inner = K.dot(h_tm1 * rec_dp_mask[0],
+ self.recurrent_kernel[:, :2 * self.units])
x_z = matrix_x[:, :self.units]
x_r = matrix_x[:, self.units:2 * self.units]
@@ -1381,323 +885,116 @@ class GRUCell(Layer):
r = self.recurrent_activation(x_r + recurrent_r)
x_h = matrix_x[:, 2 * self.units:]
- recurrent_h = K.dot(r * h_tm1, self.recurrent_kernel[:, 2 * self.units:])
+ recurrent_h = K.dot(r * h_tm1 * rec_dp_mask[0],
+ self.recurrent_kernel[:, 2 * self.units:])
hh = self.activation(x_h + recurrent_h)
+ else:
+ if self.implementation == 0:
+ x_z = inputs[:, :self.units]
+ x_r = inputs[:, self.units:2 * self.units]
+ x_h = inputs[:, 2 * self.units:]
+ elif self.implementation == 1:
+ x_z = K.dot(inputs * dp_mask[0], self.kernel_z)
+ x_r = K.dot(inputs * dp_mask[1], self.kernel_r)
+ x_h = K.dot(inputs * dp_mask[2], self.kernel_h)
+ if self.use_bias:
+ x_z = K.bias_add(x_z, self.bias_z)
+ x_r = K.bias_add(x_r, self.bias_r)
+ x_h = K.bias_add(x_h, self.bias_h)
+ else:
+ raise ValueError('Unknown `implementation` mode.')
+ z = self.recurrent_activation(x_z + K.dot(h_tm1 * rec_dp_mask[0],
+ self.recurrent_kernel_z))
+ r = self.recurrent_activation(x_r + K.dot(h_tm1 * rec_dp_mask[1],
+ self.recurrent_kernel_r))
+
+ hh = self.activation(x_h + K.dot(r * h_tm1 * rec_dp_mask[2],
+ self.recurrent_kernel_h))
h = z * h_tm1 + (1 - z) * hh
if 0 < self.dropout + self.recurrent_dropout:
- if training is None:
- h._uses_learning_phase = True
+ h._uses_learning_phase = True
return h, [h]
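For reference, the gate algebra that `step` implements, written out as a plain numpy sketch of the fused implementation-2 path (dropout masks and bias omitted; `sigmoid`/`tanh` stand in for the configurable activations):

    import numpy as np

    def sigmoid(x):
        return 1.0 / (1.0 + np.exp(-x))

    def gru_step(x, h_prev, kernel, recurrent_kernel, units):
        x_proj = x @ kernel                    # fused (batch, 3 * units)
        x_z, x_r, x_h = np.split(x_proj, 3, axis=-1)
        h_proj = h_prev @ recurrent_kernel[:, :2 * units]
        z = sigmoid(x_z + h_proj[:, :units])   # update gate
        r = sigmoid(x_r + h_proj[:, units:])   # reset gate
        hh = np.tanh(x_h + (r * h_prev) @ recurrent_kernel[:, 2 * units:])
        return z * h_prev + (1 - z) * hh       # interpolate old and new state

    rng = np.random.RandomState(0)
    units, dim = 3, 5
    h = gru_step(rng.randn(2, dim), np.zeros((2, units)),
                 rng.randn(dim, 3 * units), rng.randn(units, 3 * units), units)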
-
-class GRU(RNN):
- # pylint: disable=line-too-long
- """Gated Recurrent Unit - Cho et al.
-
- 2014.
-
- Arguments:
- units: Positive integer, dimensionality of the output space.
- activation: Activation function to use
- (see [activations](../activations.md)).
- If you pass None, no activation is applied
- (ie. "linear" activation: `a(x) = x`).
- recurrent_activation: Activation function to use
- for the recurrent step
- (see [activations](../activations.md)).
- use_bias: Boolean, whether the layer uses a bias vector.
- kernel_initializer: Initializer for the `kernel` weights matrix,
- used for the linear transformation of the inputs.
- (see [initializers](../initializers.md)).
- recurrent_initializer: Initializer for the `recurrent_kernel`
- weights matrix,
- used for the linear transformation of the recurrent state.
- (see [initializers](../initializers.md)).
- bias_initializer: Initializer for the bias vector
- (see [initializers](../initializers.md)).
- kernel_regularizer: Regularizer function applied to
- the `kernel` weights matrix
- (see [regularizer](../regularizers.md)).
- recurrent_regularizer: Regularizer function applied to
- the `recurrent_kernel` weights matrix
- (see [regularizer](../regularizers.md)).
- bias_regularizer: Regularizer function applied to the bias vector
- (see [regularizer](../regularizers.md)).
- activity_regularizer: Regularizer function applied to
- the output of the layer (its "activation").
- (see [regularizer](../regularizers.md)).
- kernel_constraint: Constraint function applied to
- the `kernel` weights matrix
- (see [constraints](../constraints.md)).
- recurrent_constraint: Constraint function applied to
- the `recurrent_kernel` weights matrix
- (see [constraints](../constraints.md)).
- bias_constraint: Constraint function applied to the bias vector
- (see [constraints](../constraints.md)).
- dropout: Float between 0 and 1.
- Fraction of the units to drop for
- the linear transformation of the inputs.
- recurrent_dropout: Float between 0 and 1.
- Fraction of the units to drop for
- the linear transformation of the recurrent state.
- implementation: Implementation mode, either 1 or 2.
- Mode 1 will structure its operations as a larger number of
- smaller dot products and additions, whereas mode 2 will
- batch them into fewer, larger operations. These modes will
- have different performance profiles on different hardware and
- for different applications.
- return_sequences: Boolean. Whether to return the last output
- in the output sequence, or the full sequence.
- return_state: Boolean. Whether to return the last state
- in addition to the output.
- go_backwards: Boolean (default False).
- If True, process the input sequence backwards and return the
- reversed sequence.
- stateful: Boolean (default False). If True, the last state
- for each sample at index i in a batch will be used as initial
- state for the sample of index i in the following batch.
- unroll: Boolean (default False).
- If True, the network will be unrolled,
- else a symbolic loop will be used.
- Unrolling can speed-up a RNN,
- although it tends to be more memory-intensive.
- Unrolling is only suitable for short sequences.
-
- References:
- - [On the Properties of Neural Machine Translation: Encoder-Decoder Approaches](https://arxiv.org/abs/1409.1259)
- - [Empirical Evaluation of Gated Recurrent Neural Networks on Sequence Modeling](http://arxiv.org/abs/1412.3555v1)
- - [A Theoretically Grounded Application of Dropout in Recurrent Neural Networks](http://arxiv.org/abs/1512.05287)
- """
- # pylint: enable=line-too-long
-
- def __init__(self,
- units,
- activation='tanh',
- recurrent_activation='hard_sigmoid',
- use_bias=True,
- kernel_initializer='glorot_uniform',
- recurrent_initializer='orthogonal',
- bias_initializer='zeros',
- kernel_regularizer=None,
- recurrent_regularizer=None,
- bias_regularizer=None,
- activity_regularizer=None,
- kernel_constraint=None,
- recurrent_constraint=None,
- bias_constraint=None,
- dropout=0.,
- recurrent_dropout=0.,
- implementation=1,
- return_sequences=False,
- return_state=False,
- go_backwards=False,
- stateful=False,
- unroll=False,
- **kwargs):
- if implementation == 0:
- logging.warning('`implementation=0` has been deprecated, '
- 'and now defaults to `implementation=1`. '
- 'Please update your layer call.')
- cell = GRUCell(
- units,
- activation=activation,
- recurrent_activation=recurrent_activation,
- use_bias=use_bias,
- kernel_initializer=kernel_initializer,
- recurrent_initializer=recurrent_initializer,
- bias_initializer=bias_initializer,
- kernel_regularizer=kernel_regularizer,
- recurrent_regularizer=recurrent_regularizer,
- bias_regularizer=bias_regularizer,
- kernel_constraint=kernel_constraint,
- recurrent_constraint=recurrent_constraint,
- bias_constraint=bias_constraint,
- dropout=dropout,
- recurrent_dropout=recurrent_dropout,
- implementation=implementation)
- super(GRU, self).__init__(
- cell,
- return_sequences=return_sequences,
- return_state=return_state,
- go_backwards=go_backwards,
- stateful=stateful,
- unroll=unroll,
- **kwargs)
- self.activity_regularizer = regularizers.get(activity_regularizer)
-
- def call(self, inputs, mask=None, training=None, initial_state=None):
- self.cell._generate_dropout_mask(inputs, training=training)
- self.cell._generate_recurrent_dropout_mask(inputs, training=training)
- return super(GRU, self).call(
- inputs, mask=mask, training=training, initial_state=initial_state)
-
- @property
- def units(self):
- return self.cell.units
-
- @property
- def activation(self):
- return self.cell.activation
-
- @property
- def recurrent_activation(self):
- return self.cell.recurrent_activation
-
- @property
- def use_bias(self):
- return self.cell.use_bias
-
- @property
- def kernel_initializer(self):
- return self.cell.kernel_initializer
-
- @property
- def recurrent_initializer(self):
- return self.cell.recurrent_initializer
-
- @property
- def bias_initializer(self):
- return self.cell.bias_initializer
-
- @property
- def kernel_regularizer(self):
- return self.cell.kernel_regularizer
-
- @property
- def recurrent_regularizer(self):
- return self.cell.recurrent_regularizer
-
- @property
- def bias_regularizer(self):
- return self.cell.bias_regularizer
-
- @property
- def kernel_constraint(self):
- return self.cell.kernel_constraint
-
- @property
- def recurrent_constraint(self):
- return self.cell.recurrent_constraint
-
- @property
- def bias_constraint(self):
- return self.cell.bias_constraint
-
- @property
- def dropout(self):
- return self.cell.dropout
-
- @property
- def recurrent_dropout(self):
- return self.cell.recurrent_dropout
-
- @property
- def implementation(self):
- return self.cell.implementation
-
def get_config(self):
config = {
- 'units':
- self.units,
- 'activation':
- activations.serialize(self.activation),
+ 'units': self.units,
+ 'activation': activations.serialize(self.activation),
'recurrent_activation':
activations.serialize(self.recurrent_activation),
- 'use_bias':
- self.use_bias,
- 'kernel_initializer':
- initializers.serialize(self.kernel_initializer),
+ 'use_bias': self.use_bias,
+ 'kernel_initializer': initializers.serialize(self.kernel_initializer),
'recurrent_initializer':
initializers.serialize(self.recurrent_initializer),
- 'bias_initializer':
- initializers.serialize(self.bias_initializer),
- 'kernel_regularizer':
- regularizers.serialize(self.kernel_regularizer),
+ 'bias_initializer': initializers.serialize(self.bias_initializer),
+ 'kernel_regularizer': regularizers.serialize(self.kernel_regularizer),
'recurrent_regularizer':
regularizers.serialize(self.recurrent_regularizer),
- 'bias_regularizer':
- regularizers.serialize(self.bias_regularizer),
+ 'bias_regularizer': regularizers.serialize(self.bias_regularizer),
'activity_regularizer':
regularizers.serialize(self.activity_regularizer),
- 'kernel_constraint':
- constraints.serialize(self.kernel_constraint),
+ 'kernel_constraint': constraints.serialize(self.kernel_constraint),
'recurrent_constraint':
constraints.serialize(self.recurrent_constraint),
- 'bias_constraint':
- constraints.serialize(self.bias_constraint),
- 'dropout':
- self.dropout,
- 'recurrent_dropout':
- self.recurrent_dropout,
- 'implementation':
- self.implementation
+ 'bias_constraint': constraints.serialize(self.bias_constraint),
+ 'dropout': self.dropout,
+ 'recurrent_dropout': self.recurrent_dropout
}
base_config = super(GRU, self).get_config()
- del base_config['cell']
return dict(list(base_config.items()) + list(config.items()))
- @classmethod
- def from_config(cls, config):
- if 'implementation' in config and config['implementation'] == 0:
- config['implementation'] = 1
- return cls(**config)
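Even with the `implementation` key dropped from the restored config above, a layer should still round-trip through `get_config`/`from_config` via the base `Layer.from_config`. A hedged usage sketch, assuming a TensorFlow build from around this commit:

    from tensorflow.python.keras._impl import keras

    layer = keras.layers.GRU(16, dropout=0.2, recurrent_dropout=0.1)
    clone = keras.layers.GRU.from_config(layer.get_config())
    assert clone.units == 16 and clone.dropout == 0.2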
+class LSTM(Recurrent):
+ """Long-Short Term Memory unit - Hochreiter 1997.
-class LSTMCell(Layer):
- """Cell class for the LSTM layer.
+ For a step-by-step description of the algorithm, see
+ [this tutorial](http://deeplearning.net/tutorial/lstm.html).
Arguments:
units: Positive integer, dimensionality of the output space.
- activation: Activation function to use
- (see [activations](../activations.md)).
+ activation: Activation function to use.
If you pass None, no activation is applied
(ie. "linear" activation: `a(x) = x`).
recurrent_activation: Activation function to use
- for the recurrent step
- (see [activations](../activations.md)).
+ for the recurrent step.
use_bias: Boolean, whether the layer uses a bias vector.
kernel_initializer: Initializer for the `kernel` weights matrix,
- used for the linear transformation of the inputs.
- (see [initializers](../initializers.md)).
+ used for the linear transformation of the inputs.
recurrent_initializer: Initializer for the `recurrent_kernel`
weights matrix,
- used for the linear transformation of the recurrent state.
- (see [initializers](../initializers.md)).
- bias_initializer: Initializer for the bias vector
- (see [initializers](../initializers.md)).
+ used for the linear transformation of the recurrent state.
+ bias_initializer: Initializer for the bias vector.
unit_forget_bias: Boolean.
If True, add 1 to the bias of the forget gate at initialization.
Setting it to true will also force `bias_initializer="zeros"`.
This is recommended in [Jozefowicz et
al.](http://www.jmlr.org/proceedings/papers/v37/jozefowicz15.pdf)
kernel_regularizer: Regularizer function applied to
- the `kernel` weights matrix
- (see [regularizer](../regularizers.md)).
+ the `kernel` weights matrix.
recurrent_regularizer: Regularizer function applied to
- the `recurrent_kernel` weights matrix
- (see [regularizer](../regularizers.md)).
- bias_regularizer: Regularizer function applied to the bias vector
- (see [regularizer](../regularizers.md)).
+ the `recurrent_kernel` weights matrix.
+ bias_regularizer: Regularizer function applied to the bias vector.
+ activity_regularizer: Regularizer function applied to
+ the output of the layer (its "activation").
kernel_constraint: Constraint function applied to
- the `kernel` weights matrix
- (see [constraints](../constraints.md)).
+ the `kernel` weights matrix.
recurrent_constraint: Constraint function applied to
- the `recurrent_kernel` weights matrix
- (see [constraints](../constraints.md)).
- bias_constraint: Constraint function applied to the bias vector
- (see [constraints](../constraints.md)).
+ the `recurrent_kernel` weights matrix.
+ bias_constraint: Constraint function applied to the bias vector.
dropout: Float between 0 and 1.
Fraction of the units to drop for
the linear transformation of the inputs.
recurrent_dropout: Float between 0 and 1.
Fraction of the units to drop for
the linear transformation of the recurrent state.
- implementation: Implementation mode, either 1 or 2.
- Mode 1 will structure its operations as a larger number of
- smaller dot products and additions, whereas mode 2 will
- batch them into fewer, larger operations. These modes will
- have different performance profiles on different hardware and
- for different applications.
+
+ References:
+ - [Long short-term
+ memory](http://www.bioinf.jku.at/publications/older/2604.pdf)
+ (original 1997 paper)
+ - [Supervised sequence labeling with recurrent neural
+ networks](http://www.cs.toronto.edu/~graves/preprint.pdf)
+ - [A Theoretically Grounded Application of Dropout in Recurrent Neural
+ Networks](http://arxiv.org/abs/1512.05287)
"""
def __init__(self,
@@ -1712,14 +1009,15 @@ class LSTMCell(Layer):
kernel_regularizer=None,
recurrent_regularizer=None,
bias_regularizer=None,
+ activity_regularizer=None,
kernel_constraint=None,
recurrent_constraint=None,
bias_constraint=None,
dropout=0.,
recurrent_dropout=0.,
- implementation=1,
**kwargs):
- super(LSTMCell, self).__init__(**kwargs)
+ super(LSTM, self).__init__(
+ activity_regularizer=regularizers.get(activity_regularizer), **kwargs)
self.units = units
self.activation = activations.get(activation)
self.recurrent_activation = activations.get(recurrent_activation)
@@ -1740,15 +1038,25 @@ class LSTMCell(Layer):
self.dropout = min(1., max(0., dropout))
self.recurrent_dropout = min(1., max(0., recurrent_dropout))
- self.implementation = implementation
- self.state_size = (self.units, self.units)
- self._dropout_mask = None
- self._recurrent_dropout_mask = None
+ self.state_spec = [
+ InputSpec(shape=(None, self.units)),
+ InputSpec(shape=(None, self.units))
+ ]
def build(self, input_shape):
- input_dim = input_shape[-1]
+ if isinstance(input_shape, list):
+ input_shape = input_shape[0]
+ input_shape = tensor_shape.TensorShape(input_shape).as_list()
+ batch_size = input_shape[0] if self.stateful else None
+ self.input_dim = input_shape[2]
+ self.input_spec[0] = InputSpec(shape=(batch_size, None, self.input_dim))
+
+ self.states = [None, None]
+ if self.stateful:
+ self.reset_states()
+
self.kernel = self.add_weight(
- shape=(input_dim, self.units * 4),
+ shape=(self.input_dim, self.units * 4),
name='kernel',
initializer=self.kernel_initializer,
regularizer=self.kernel_regularizer,
@@ -1804,90 +1112,96 @@ class LSTMCell(Layer):
self.bias_o = None
self.built = True
- def _generate_dropout_mask(self, inputs, training=None):
- if 0 < self.dropout < 1:
- ones = K.ones_like(K.squeeze(inputs[:, 0:1, :], axis=1))
+ def preprocess_input(self, inputs, training=None):
+ if self.implementation == 0:
+ input_shape = inputs.get_shape().as_list()
+ input_dim = input_shape[2]
+ timesteps = input_shape[1]
+
+ x_i = _time_distributed_dense(
+ inputs,
+ self.kernel_i,
+ self.bias_i,
+ self.dropout,
+ input_dim,
+ self.units,
+ timesteps,
+ training=training)
+ x_f = _time_distributed_dense(
+ inputs,
+ self.kernel_f,
+ self.bias_f,
+ self.dropout,
+ input_dim,
+ self.units,
+ timesteps,
+ training=training)
+ x_c = _time_distributed_dense(
+ inputs,
+ self.kernel_c,
+ self.bias_c,
+ self.dropout,
+ input_dim,
+ self.units,
+ timesteps,
+ training=training)
+ x_o = _time_distributed_dense(
+ inputs,
+ self.kernel_o,
+ self.bias_o,
+ self.dropout,
+ input_dim,
+ self.units,
+ timesteps,
+ training=training)
+ return K.concatenate([x_i, x_f, x_c, x_o], axis=2)
+ else:
+ return inputs
+
+ def get_constants(self, inputs, training=None):
+ constants = []
+ if self.implementation != 0 and 0 < self.dropout < 1:
+ input_shape = K.int_shape(inputs)
+ input_dim = input_shape[-1]
+ ones = K.ones_like(K.reshape(inputs[:, 0, 0], (-1, 1)))
+ ones = K.tile(ones, (1, int(input_dim)))
def dropped_inputs():
return K.dropout(ones, self.dropout)
- self._dropout_mask = [
+ dp_mask = [
K.in_train_phase(dropped_inputs, ones, training=training)
for _ in range(4)
]
+ constants.append(dp_mask)
else:
- self._dropout_mask = None
+ constants.append([K.cast_to_floatx(1.) for _ in range(4)])
- def _generate_recurrent_dropout_mask(self, inputs, training=None):
if 0 < self.recurrent_dropout < 1:
ones = K.ones_like(K.reshape(inputs[:, 0, 0], (-1, 1)))
ones = K.tile(ones, (1, self.units))
- def dropped_inputs():
- return K.dropout(ones, self.dropout)
+ def dropped_inputs(): # pylint: disable=function-redefined
+ return K.dropout(ones, self.recurrent_dropout)
- self._recurrent_dropout_mask = [
+ rec_dp_mask = [
K.in_train_phase(dropped_inputs, ones, training=training)
for _ in range(4)
]
+ constants.append(rec_dp_mask)
else:
- self._recurrent_dropout_mask = None
-
- def call(self, inputs, states, training=None):
- # dropout matrices for input units
- dp_mask = self._dropout_mask
- # dropout matrices for recurrent units
- rec_dp_mask = self._recurrent_dropout_mask
-
- h_tm1 = states[0] # previous memory state
- c_tm1 = states[1] # previous carry state
-
- if self.implementation == 1:
- if 0 < self.dropout < 1.:
- inputs_i = inputs * dp_mask[0]
- inputs_f = inputs * dp_mask[1]
- inputs_c = inputs * dp_mask[2]
- inputs_o = inputs * dp_mask[3]
- else:
- inputs_i = inputs
- inputs_f = inputs
- inputs_c = inputs
- inputs_o = inputs
- x_i = K.dot(inputs_i, self.kernel_i)
- x_f = K.dot(inputs_f, self.kernel_f)
- x_c = K.dot(inputs_c, self.kernel_c)
- x_o = K.dot(inputs_o, self.kernel_o)
- if self.use_bias:
- x_i = K.bias_add(x_i, self.bias_i)
- x_f = K.bias_add(x_f, self.bias_f)
- x_c = K.bias_add(x_c, self.bias_c)
- x_o = K.bias_add(x_o, self.bias_o)
-
- if 0 < self.recurrent_dropout < 1.:
- h_tm1_i = h_tm1 * rec_dp_mask[0]
- h_tm1_f = h_tm1 * rec_dp_mask[1]
- h_tm1_c = h_tm1 * rec_dp_mask[2]
- h_tm1_o = h_tm1 * rec_dp_mask[3]
- else:
- h_tm1_i = h_tm1
- h_tm1_f = h_tm1
- h_tm1_c = h_tm1
- h_tm1_o = h_tm1
- i = self.recurrent_activation(
- x_i + K.dot(h_tm1_i, self.recurrent_kernel_i))
- f = self.recurrent_activation(
- x_f + K.dot(h_tm1_f, self.recurrent_kernel_f))
- c = f * c_tm1 + i * self.activation(
- x_c + K.dot(h_tm1_c, self.recurrent_kernel_c))
- o = self.recurrent_activation(
- x_o + K.dot(h_tm1_o, self.recurrent_kernel_o))
- else:
- if 0. < self.dropout < 1.:
- inputs *= dp_mask[0]
- z = K.dot(inputs, self.kernel)
- if 0. < self.recurrent_dropout < 1.:
- h_tm1 *= rec_dp_mask[0]
- z += K.dot(h_tm1, self.recurrent_kernel)
+ constants.append([K.cast_to_floatx(1.) for _ in range(4)])
+ return constants
+
+ def step(self, inputs, states):
+ h_tm1 = states[0]
+ c_tm1 = states[1]
+ dp_mask = states[2]
+ rec_dp_mask = states[3]
+
+ if self.implementation == 2:
+ z = K.dot(inputs * dp_mask[0], self.kernel)
+ z += K.dot(h_tm1 * rec_dp_mask[0], self.recurrent_kernel)
if self.use_bias:
z = K.bias_add(z, self.bias)
@@ -1900,606 +1214,57 @@ class LSTMCell(Layer):
f = self.recurrent_activation(z1)
c = f * c_tm1 + i * self.activation(z2)
o = self.recurrent_activation(z3)
+ else:
+ if self.implementation == 0:
+ x_i = inputs[:, :self.units]
+ x_f = inputs[:, self.units:2 * self.units]
+ x_c = inputs[:, 2 * self.units:3 * self.units]
+ x_o = inputs[:, 3 * self.units:]
+ elif self.implementation == 1:
+ x_i = K.dot(inputs * dp_mask[0], self.kernel_i) + self.bias_i
+ x_f = K.dot(inputs * dp_mask[1], self.kernel_f) + self.bias_f
+ x_c = K.dot(inputs * dp_mask[2], self.kernel_c) + self.bias_c
+ x_o = K.dot(inputs * dp_mask[3], self.kernel_o) + self.bias_o
+ else:
+ raise ValueError('Unknown `implementation` mode.')
+ i = self.recurrent_activation(x_i + K.dot(h_tm1 * rec_dp_mask[0],
+ self.recurrent_kernel_i))
+ f = self.recurrent_activation(x_f + K.dot(h_tm1 * rec_dp_mask[1],
+ self.recurrent_kernel_f))
+ c = f * c_tm1 + i * self.activation(
+ x_c + K.dot(h_tm1 * rec_dp_mask[2], self.recurrent_kernel_c))
+ o = self.recurrent_activation(x_o + K.dot(h_tm1 * rec_dp_mask[3],
+ self.recurrent_kernel_o))
h = o * self.activation(c)
if 0 < self.dropout + self.recurrent_dropout:
- if training is None:
- h._uses_learning_phase = True
+ h._uses_learning_phase = True
return h, [h, c]
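As with the GRU, the carry/hidden updates computed by `step` can be written out compactly. A numpy sketch of the fused implementation-2 path above (dropout masks omitted; activations fixed to `sigmoid`/`tanh` for illustration):

    import numpy as np

    def sigmoid(x):
        return 1.0 / (1.0 + np.exp(-x))

    def lstm_step(x, h_prev, c_prev, kernel, recurrent_kernel, bias, units):
        # One fused projection, then split into i, f, c, o blocks,
        # mirroring how the code above slices z.
        z = x @ kernel + h_prev @ recurrent_kernel + bias
        z0, z1, z2, z3 = np.split(z, 4, axis=-1)
        i = sigmoid(z0)                   # input gate
        f = sigmoid(z1)                   # forget gate
        c = f * c_prev + i * np.tanh(z2)  # new carry state
        o = sigmoid(z3)                   # output gate
        return o * np.tanh(c), c

    rng = np.random.RandomState(0)
    units, dim = 3, 5
    h, c = lstm_step(rng.randn(2, dim), np.zeros((2, units)),
                     np.zeros((2, units)), rng.randn(dim, 4 * units),
                     rng.randn(units, 4 * units), np.zeros(4 * units), units)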
-
-class LSTM(RNN):
- # pylint: disable=line-too-long
- """Long-Short Term Memory layer - Hochreiter 1997.
-
- Arguments:
- units: Positive integer, dimensionality of the output space.
- activation: Activation function to use
- (see [activations](../activations.md)).
- If you pass None, no activation is applied
- (ie. "linear" activation: `a(x) = x`).
- recurrent_activation: Activation function to use
- for the recurrent step
- (see [activations](../activations.md)).
- use_bias: Boolean, whether the layer uses a bias vector.
- kernel_initializer: Initializer for the `kernel` weights matrix,
- used for the linear transformation of the inputs.
- (see [initializers](../initializers.md)).
- recurrent_initializer: Initializer for the `recurrent_kernel`
- weights matrix,
- used for the linear transformation of the recurrent state.
- (see [initializers](../initializers.md)).
- bias_initializer: Initializer for the bias vector
- (see [initializers](../initializers.md)).
- unit_forget_bias: Boolean.
- If True, add 1 to the bias of the forget gate at initialization.
- Setting it to true will also force `bias_initializer="zeros"`.
- This is recommended in [Jozefowicz et
- al.](http://www.jmlr.org/proceedings/papers/v37/jozefowicz15.pdf)
- kernel_regularizer: Regularizer function applied to
- the `kernel` weights matrix
- (see [regularizer](../regularizers.md)).
- recurrent_regularizer: Regularizer function applied to
- the `recurrent_kernel` weights matrix
- (see [regularizer](../regularizers.md)).
- bias_regularizer: Regularizer function applied to the bias vector
- (see [regularizer](../regularizers.md)).
- activity_regularizer: Regularizer function applied to
- the output of the layer (its "activation").
- (see [regularizer](../regularizers.md)).
- kernel_constraint: Constraint function applied to
- the `kernel` weights matrix
- (see [constraints](../constraints.md)).
- recurrent_constraint: Constraint function applied to
- the `recurrent_kernel` weights matrix
- (see [constraints](../constraints.md)).
- bias_constraint: Constraint function applied to the bias vector
- (see [constraints](../constraints.md)).
- dropout: Float between 0 and 1.
- Fraction of the units to drop for
- the linear transformation of the inputs.
- recurrent_dropout: Float between 0 and 1.
- Fraction of the units to drop for
- the linear transformation of the recurrent state.
- implementation: Implementation mode, either 1 or 2.
- Mode 1 will structure its operations as a larger number of
- smaller dot products and additions, whereas mode 2 will
- batch them into fewer, larger operations. These modes will
- have different performance profiles on different hardware and
- for different applications.
- return_sequences: Boolean. Whether to return the last output
- in the output sequence, or the full sequence.
- return_state: Boolean. Whether to return the last state
- in addition to the output.
- go_backwards: Boolean (default False).
- If True, process the input sequence backwards and return the
- reversed sequence.
- stateful: Boolean (default False). If True, the last state
- for each sample at index i in a batch will be used as initial
- state for the sample of index i in the following batch.
- unroll: Boolean (default False).
- If True, the network will be unrolled,
- else a symbolic loop will be used.
- Unrolling can speed-up a RNN,
- although it tends to be more memory-intensive.
- Unrolling is only suitable for short sequences.
-
- References:
- - [Long short-term memory](http://www.bioinf.jku.at/publications/older/2604.pdf)
- - [Learning to forget: Continual prediction with LSTM](http://www.mitpressjournals.org/doi/pdf/10.1162/089976600300015015)
- - [Supervised sequence labeling with recurrent neural networks](http://www.cs.toronto.edu/~graves/preprint.pdf)
- - [A Theoretically Grounded Application of Dropout in Recurrent Neural Networks](http://arxiv.org/abs/1512.05287)
- """
- # pylint: enable=line-too-long
-
- def __init__(self,
- units,
- activation='tanh',
- recurrent_activation='hard_sigmoid',
- use_bias=True,
- kernel_initializer='glorot_uniform',
- recurrent_initializer='orthogonal',
- bias_initializer='zeros',
- unit_forget_bias=True,
- kernel_regularizer=None,
- recurrent_regularizer=None,
- bias_regularizer=None,
- activity_regularizer=None,
- kernel_constraint=None,
- recurrent_constraint=None,
- bias_constraint=None,
- dropout=0.,
- recurrent_dropout=0.,
- implementation=1,
- return_sequences=False,
- return_state=False,
- go_backwards=False,
- stateful=False,
- unroll=False,
- **kwargs):
- if implementation == 0:
- logging.warning('`implementation=0` has been deprecated, '
- 'and now defaults to `implementation=1`. '
- 'Please update your layer call.')
- cell = LSTMCell(
- units,
- activation=activation,
- recurrent_activation=recurrent_activation,
- use_bias=use_bias,
- kernel_initializer=kernel_initializer,
- recurrent_initializer=recurrent_initializer,
- unit_forget_bias=unit_forget_bias,
- bias_initializer=bias_initializer,
- kernel_regularizer=kernel_regularizer,
- recurrent_regularizer=recurrent_regularizer,
- bias_regularizer=bias_regularizer,
- kernel_constraint=kernel_constraint,
- recurrent_constraint=recurrent_constraint,
- bias_constraint=bias_constraint,
- dropout=dropout,
- recurrent_dropout=recurrent_dropout,
- implementation=implementation)
- super(LSTM, self).__init__(
- cell,
- return_sequences=return_sequences,
- return_state=return_state,
- go_backwards=go_backwards,
- stateful=stateful,
- unroll=unroll,
- **kwargs)
- self.activity_regularizer = regularizers.get(activity_regularizer)
-
- def call(self, inputs, mask=None, training=None, initial_state=None):
- self.cell._generate_dropout_mask(inputs, training=training)
- self.cell._generate_recurrent_dropout_mask(inputs, training=training)
- return super(LSTM, self).call(
- inputs, mask=mask, training=training, initial_state=initial_state)
-
- @property
- def units(self):
- return self.cell.units
-
- @property
- def activation(self):
- return self.cell.activation
-
- @property
- def recurrent_activation(self):
- return self.cell.recurrent_activation
-
- @property
- def use_bias(self):
- return self.cell.use_bias
-
- @property
- def kernel_initializer(self):
- return self.cell.kernel_initializer
-
- @property
- def recurrent_initializer(self):
- return self.cell.recurrent_initializer
-
- @property
- def bias_initializer(self):
- return self.cell.bias_initializer
-
- @property
- def unit_forget_bias(self):
- return self.cell.unit_forget_bias
-
- @property
- def kernel_regularizer(self):
- return self.cell.kernel_regularizer
-
- @property
- def recurrent_regularizer(self):
- return self.cell.recurrent_regularizer
-
- @property
- def bias_regularizer(self):
- return self.cell.bias_regularizer
-
- @property
- def kernel_constraint(self):
- return self.cell.kernel_constraint
-
- @property
- def recurrent_constraint(self):
- return self.cell.recurrent_constraint
-
- @property
- def bias_constraint(self):
- return self.cell.bias_constraint
-
- @property
- def dropout(self):
- return self.cell.dropout
-
- @property
- def recurrent_dropout(self):
- return self.cell.recurrent_dropout
-
- @property
- def implementation(self):
- return self.cell.implementation
-
def get_config(self):
config = {
- 'units':
- self.units,
- 'activation':
- activations.serialize(self.activation),
+ 'units': self.units,
+ 'activation': activations.serialize(self.activation),
'recurrent_activation':
activations.serialize(self.recurrent_activation),
- 'use_bias':
- self.use_bias,
- 'kernel_initializer':
- initializers.serialize(self.kernel_initializer),
+ 'use_bias': self.use_bias,
+ 'kernel_initializer': initializers.serialize(self.kernel_initializer),
'recurrent_initializer':
initializers.serialize(self.recurrent_initializer),
- 'bias_initializer':
- initializers.serialize(self.bias_initializer),
- 'unit_forget_bias':
- self.unit_forget_bias,
- 'kernel_regularizer':
- regularizers.serialize(self.kernel_regularizer),
+ 'bias_initializer': initializers.serialize(self.bias_initializer),
+ 'unit_forget_bias': self.unit_forget_bias,
+ 'kernel_regularizer': regularizers.serialize(self.kernel_regularizer),
'recurrent_regularizer':
regularizers.serialize(self.recurrent_regularizer),
- 'bias_regularizer':
- regularizers.serialize(self.bias_regularizer),
+ 'bias_regularizer': regularizers.serialize(self.bias_regularizer),
'activity_regularizer':
regularizers.serialize(self.activity_regularizer),
- 'kernel_constraint':
- constraints.serialize(self.kernel_constraint),
+ 'kernel_constraint': constraints.serialize(self.kernel_constraint),
'recurrent_constraint':
constraints.serialize(self.recurrent_constraint),
- 'bias_constraint':
- constraints.serialize(self.bias_constraint),
- 'dropout':
- self.dropout,
- 'recurrent_dropout':
- self.recurrent_dropout,
- 'implementation':
- self.implementation
+ 'bias_constraint': constraints.serialize(self.bias_constraint),
+ 'dropout': self.dropout,
+ 'recurrent_dropout': self.recurrent_dropout
}
base_config = super(LSTM, self).get_config()
- del base_config['cell']
- return dict(list(base_config.items()) + list(config.items()))
-
- @classmethod
- def from_config(cls, config):
- if 'implementation' in config and config['implementation'] == 0:
- config['implementation'] = 1
- return cls(**config)
-
-
-class Recurrent(Layer):
- """Deprecated abstract base class for recurrent layers.
-
- It still exists because it is leveraged by the convolutional-recurrent layers.
- It will be removed entirely in the future.
- It was never part of the public API.
- Do not use.
-
- Arguments:
- weights: list of Numpy arrays to set as initial weights.
- The list should have 3 elements, of shapes:
- `[(input_dim, output_dim), (output_dim, output_dim), (output_dim,)]`.
- return_sequences: Boolean. Whether to return the last output
- in the output sequence, or the full sequence.
- return_state: Boolean. Whether to return the last state
- in addition to the output.
- go_backwards: Boolean (default False).
- If True, process the input sequence backwards and return the
- reversed sequence.
- stateful: Boolean (default False). If True, the last state
- for each sample at index i in a batch will be used as initial
- state for the sample of index i in the following batch.
- unroll: Boolean (default False).
- If True, the network will be unrolled,
- else a symbolic loop will be used.
- Unrolling can speed-up a RNN,
- although it tends to be more memory-intensive.
- Unrolling is only suitable for short sequences.
- implementation: one of {0, 1, or 2}.
- If set to 0, the RNN will use
- an implementation that uses fewer, larger matrix products,
- thus running faster on CPU but consuming more memory.
- If set to 1, the RNN will use more matrix products,
- but smaller ones, thus running slower
- (may actually be faster on GPU) while consuming less memory.
- If set to 2 (LSTM/GRU only),
- the RNN will combine the input gate,
- the forget gate and the output gate into a single matrix,
- enabling more time-efficient parallelization on the GPU.
- Note: RNN dropout must be shared for all gates,
- resulting in a slightly reduced regularization.
- input_dim: dimensionality of the input (integer).
- This argument (or alternatively, the keyword argument `input_shape`)
- is required when using this layer as the first layer in a model.
- input_length: Length of input sequences, to be specified
- when it is constant.
- This argument is required if you are going to connect
- `Flatten` then `Dense` layers upstream
- (without it, the shape of the dense outputs cannot be computed).
- Note that if the recurrent layer is not the first layer
- in your model, you would need to specify the input length
- at the level of the first layer
- (e.g. via the `input_shape` argument)
-
- Input shape:
- 3D tensor with shape `(batch_size, timesteps, input_dim)`,
- (Optional) 2D tensors with shape `(batch_size, output_dim)`.
-
- Output shape:
- - if `return_state`: a list of tensors. The first tensor is
- the output. The remaining tensors are the last states,
- each with shape `(batch_size, units)`.
- - if `return_sequences`: 3D tensor with shape
- `(batch_size, timesteps, units)`.
- - else, 2D tensor with shape `(batch_size, units)`.
-
- # Masking
- This layer supports masking for input data with a variable number
- of timesteps. To introduce masks to your data,
- use an `Embedding` layer with the `mask_zero` parameter
- set to `True`.
-
- # Note on using statefulness in RNNs
- You can set RNN layers to be 'stateful', which means that the states
- computed for the samples in one batch will be reused as initial states
- for the samples in the next batch. This assumes a one-to-one mapping
- between samples in different successive batches.
-
- To enable statefulness:
- - specify `stateful=True` in the layer constructor.
- - specify a fixed batch size for your model, by passing
- if sequential model:
- `batch_input_shape=(...)` to the first layer in your model.
- else for functional model with 1 or more Input layers:
- `batch_shape=(...)` to all the first layers in your model.
- This is the expected shape of your inputs
- *including the batch size*.
- It should be a tuple of integers, e.g. `(32, 10, 100)`.
- - specify `shuffle=False` when calling fit().
-
- To reset the states of your model, call `.reset_states()` on either
- a specific layer, or on your entire model.
-
- # Note on specifying the initial state of RNNs
- You can specify the initial state of RNN layers symbolically by
- calling them with the keyword argument `initial_state`. The value of
- `initial_state` should be a tensor or list of tensors representing
- the initial state of the RNN layer.
-
- You can specify the initial state of RNN layers numerically by
- calling `reset_states` with the keyword argument `states`. The value of
- `states` should be a numpy array or list of numpy arrays representing
- the initial state of the RNN layer.
- """
-
- def __init__(self,
- return_sequences=False,
- return_state=False,
- go_backwards=False,
- stateful=False,
- unroll=False,
- implementation=0,
- **kwargs):
- super(Recurrent, self).__init__(**kwargs)
- self.return_sequences = return_sequences
- self.return_state = return_state
- self.go_backwards = go_backwards
- self.stateful = stateful
- self.unroll = unroll
- self.implementation = implementation
- self.supports_masking = True
- self.input_spec = [InputSpec(ndim=3)]
- self.state_spec = None
- self.dropout = 0
- self.recurrent_dropout = 0
-
- def _compute_output_shape(self, input_shape):
- if isinstance(input_shape, list):
- input_shape = input_shape[0]
- input_shape = tensor_shape.TensorShape(input_shape).as_list()
- if self.return_sequences:
- output_shape = (input_shape[0], input_shape[1], self.units)
- else:
- output_shape = (input_shape[0], self.units)
-
- if self.return_state:
- state_shape = [tensor_shape.TensorShape(
- (input_shape[0], self.units)) for _ in self.states]
- return [tensor_shape.TensorShape(output_shape)] + state_shape
- return tensor_shape.TensorShape(output_shape)
-
- def compute_mask(self, inputs, mask):
- if isinstance(mask, list):
- mask = mask[0]
- output_mask = mask if self.return_sequences else None
- if self.return_state:
- state_mask = [None for _ in self.states]
- return [output_mask] + state_mask
- return output_mask
-
- def step(self, inputs, states):
- raise NotImplementedError
-
- def get_constants(self, inputs, training=None):
- return []
-
- def get_initial_state(self, inputs):
- # build an all-zero tensor of shape (samples, output_dim)
- initial_state = K.zeros_like(inputs) # (samples, timesteps, input_dim)
- initial_state = K.sum(initial_state, axis=(1, 2)) # (samples,)
- initial_state = K.expand_dims(initial_state) # (samples, 1)
- initial_state = K.tile(initial_state, [1,
- self.units]) # (samples, output_dim)
- initial_state = [initial_state for _ in range(len(self.states))]
- return initial_state
-
- def preprocess_input(self, inputs, training=None):
- return inputs
-
- def __call__(self, inputs, initial_state=None, **kwargs):
- if (isinstance(inputs, (list, tuple)) and
- len(inputs) > 1
- and initial_state is None):
- initial_state = inputs[1:]
- inputs = inputs[0]
-
- # If `initial_state` is specified,
- # and if it a Keras tensor,
- # then add it to the inputs and temporarily
- # modify the input spec to include the state.
- if initial_state is None:
- return super(Recurrent, self).__call__(inputs, **kwargs)
-
- if not isinstance(initial_state, (list, tuple)):
- initial_state = [initial_state]
-
- is_keras_tensor = hasattr(initial_state[0], '_keras_history')
- for tensor in initial_state:
- if hasattr(tensor, '_keras_history') != is_keras_tensor:
- raise ValueError('The initial state of an RNN layer cannot be'
- ' specified with a mix of Keras tensors and'
- ' non-Keras tensors')
-
- if is_keras_tensor:
- # Compute the full input spec, including state
- input_spec = self.input_spec
- state_spec = self.state_spec
- if not isinstance(input_spec, list):
- input_spec = [input_spec]
- if not isinstance(state_spec, list):
- state_spec = [state_spec]
- self.input_spec = input_spec + state_spec
-
- # Compute the full inputs, including state
- inputs = [inputs] + list(initial_state)
-
- # Perform the call
- output = super(Recurrent, self).__call__(inputs, **kwargs)
-
- # Restore original input spec
- self.input_spec = input_spec
- return output
- else:
- kwargs['initial_state'] = initial_state
- return super(Recurrent, self).__call__(inputs, **kwargs)
-
- def call(self, inputs, mask=None, training=None, initial_state=None):
- # input shape: `(samples, time (padded with zeros), input_dim)`
- # note that the .build() method of subclasses MUST define
- # self.input_spec and self.state_spec with complete input shapes.
- if isinstance(inputs, list):
- initial_state = inputs[1:]
- inputs = inputs[0]
- elif initial_state is not None:
- pass
- elif self.stateful:
- initial_state = self.states
- else:
- initial_state = self.get_initial_state(inputs)
-
- if isinstance(mask, list):
- mask = mask[0]
-
- if len(initial_state) != len(self.states):
- raise ValueError('Layer has ' + str(len(self.states)) +
- ' states but was passed ' + str(len(initial_state)) +
- ' initial states.')
- input_shape = K.int_shape(inputs)
- if self.unroll and input_shape[1] is None:
- raise ValueError('Cannot unroll a RNN if the '
- 'time dimension is undefined. \n'
- '- If using a Sequential model, '
- 'specify the time dimension by passing '
- 'an `input_shape` or `batch_input_shape` '
- 'argument to your first layer. If your '
- 'first layer is an Embedding, you can '
- 'also use the `input_length` argument.\n'
- '- If using the functional API, specify '
- 'the time dimension by passing a `shape` '
- 'or `batch_shape` argument to your Input layer.')
- constants = self.get_constants(inputs, training=None)
- preprocessed_input = self.preprocess_input(inputs, training=None)
- last_output, outputs, states = K.rnn(
- self.step,
- preprocessed_input,
- initial_state,
- go_backwards=self.go_backwards,
- mask=mask,
- constants=constants,
- unroll=self.unroll)
- if self.stateful:
- updates = []
- for i in range(len(states)):
- updates.append((self.states[i], states[i]))
- self.add_update(updates, inputs)
-
- # Properly set learning phase
- if 0 < self.dropout + self.recurrent_dropout:
- last_output._uses_learning_phase = True
- outputs._uses_learning_phase = True
-
- if not self.return_sequences:
- outputs = last_output
-
- if self.return_state:
- if not isinstance(states, (list, tuple)):
- states = [states]
- else:
- states = list(states)
- return [outputs] + states
- return outputs
-
- def reset_states(self, states=None):
- if not self.stateful:
- raise AttributeError('Layer must be stateful.')
- batch_size = self.input_spec[0].shape[0]
- if not batch_size:
- raise ValueError('If a RNN is stateful, it needs to know '
- 'its batch size. Specify the batch size '
- 'of your input tensors: \n'
- '- If using a Sequential model, '
- 'specify the batch size by passing '
- 'a `batch_input_shape` '
- 'argument to your first layer.\n'
- '- If using the functional API, specify '
- 'the time dimension by passing a '
- '`batch_shape` argument to your Input layer.')
- # initialize state if None
- if self.states[0] is None:
- self.states = [K.zeros((batch_size, self.units)) for _ in self.states]
- elif states is None:
- for state in self.states:
- K.set_value(state, np.zeros((batch_size, self.units)))
- else:
- if not isinstance(states, (list, tuple)):
- states = [states]
- if len(states) != len(self.states):
- raise ValueError('Layer ' + self.name + ' expects ' +
- str(len(self.states)) + ' states, '
- 'but it received ' + str(len(states)) +
- ' state values. Input received: ' + str(states))
- for index, (value, state) in enumerate(zip(states, self.states)):
- if value.shape != (batch_size, self.units):
- raise ValueError('State ' + str(index) +
- ' is incompatible with layer ' + self.name +
- ': expected shape=' + str((batch_size, self.units)) +
- ', found shape=' + str(value.shape))
- K.set_value(state, value)
-
- def get_config(self):
- config = {
- 'return_sequences': self.return_sequences,
- 'return_state': self.return_state,
- 'go_backwards': self.go_backwards,
- 'stateful': self.stateful,
- 'unroll': self.unroll,
- 'implementation': self.implementation
- }
- base_config = super(Recurrent, self).get_config()
return dict(list(base_config.items()) + list(config.items()))
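After this revert, GRU and LSTM are again monolithic `Recurrent` subclasses rather than thin wrappers around cells, so they are used directly; the cell-composition API (`keras.layers.RNN(cell)`) exercised by the test file deleted below is gone. A usage sketch of the restored layers, assuming a TensorFlow build from around this commit:

    import numpy as np
    from tensorflow.python.keras._impl import keras

    model = keras.models.Sequential()
    model.add(keras.layers.LSTM(32, input_shape=(10, 8)))
    model.compile(optimizer='rmsprop', loss='mse')
    model.train_on_batch(np.zeros((4, 10, 8)), np.zeros((4, 32)))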
diff --git a/tensorflow/python/keras/_impl/keras/layers/recurrent_test.py b/tensorflow/python/keras/_impl/keras/layers/recurrent_test.py
deleted file mode 100644
index b1f89a30bb..0000000000
--- a/tensorflow/python/keras/_impl/keras/layers/recurrent_test.py
+++ /dev/null
@@ -1,378 +0,0 @@
-# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-# ==============================================================================
-"""Tests for recurrent layers functionality other than GRU, LSTM, SimpleRNN.
-
-See also: lstm_test.py, gru_test.py, simplernn_test.py.
-"""
-
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-
-import numpy as np
-
-from tensorflow.python.keras._impl import keras
-from tensorflow.python.platform import test
-
-
-class RNNTest(test.TestCase):
-
- def test_minimal_rnn_cell_non_layer(self):
-
- class MinimalRNNCell(object):
-
- def __init__(self, units, input_dim):
- self.units = units
- self.state_size = units
- self.kernel = keras.backend.variable(
- np.random.random((input_dim, units)))
-
- def call(self, inputs, states):
- prev_output = states[0]
- output = keras.backend.dot(inputs, self.kernel) + prev_output
- return output, [output]
-
- with self.test_session():
- # Basic test case.
- cell = MinimalRNNCell(32, 5)
- x = keras.Input((None, 5))
- layer = keras.layers.RNN(cell)
- y = layer(x)
- model = keras.models.Model(x, y)
- model.compile(optimizer='rmsprop', loss='mse')
- model.train_on_batch(np.zeros((6, 5, 5)), np.zeros((6, 32)))
-
- # Test stacking.
- cells = [MinimalRNNCell(8, 5),
- MinimalRNNCell(32, 8),
- MinimalRNNCell(32, 32)]
- layer = keras.layers.RNN(cells)
- y = layer(x)
- model = keras.models.Model(x, y)
- model.compile(optimizer='rmsprop', loss='mse')
- model.train_on_batch(np.zeros((6, 5, 5)), np.zeros((6, 32)))
-
- def test_minimal_rnn_cell_non_layer_multiple_states(self):
-
- class MinimalRNNCell(object):
-
- def __init__(self, units, input_dim):
- self.units = units
- self.state_size = (units, units)
- self.kernel = keras.backend.variable(
- np.random.random((input_dim, units)))
-
- def call(self, inputs, states):
- prev_output_1 = states[0]
- prev_output_2 = states[1]
- output = keras.backend.dot(inputs, self.kernel)
- output += prev_output_1
- output -= prev_output_2
- return output, [output * 2, output * 3]
-
- with self.test_session():
- # Basic test case.
- cell = MinimalRNNCell(32, 5)
- x = keras.Input((None, 5))
- layer = keras.layers.RNN(cell)
- y = layer(x)
- model = keras.models.Model(x, y)
- model.compile(optimizer='rmsprop', loss='mse')
- model.train_on_batch(np.zeros((6, 5, 5)), np.zeros((6, 32)))
-
- # Test stacking.
- cells = [MinimalRNNCell(8, 5),
- MinimalRNNCell(16, 8),
- MinimalRNNCell(32, 16)]
- layer = keras.layers.RNN(cells)
- assert layer.cell.state_size == (32, 32, 16, 16, 8, 8)
- y = layer(x)
- model = keras.models.Model(x, y)
- model.compile(optimizer='rmsprop', loss='mse')
- model.train_on_batch(np.zeros((6, 5, 5)), np.zeros((6, 32)))
-
- def test_minimal_rnn_cell_layer(self):
-
- class MinimalRNNCell(keras.layers.Layer):
-
- def __init__(self, units, **kwargs):
- self.units = units
- self.state_size = units
- super(MinimalRNNCell, self).__init__(**kwargs)
-
- def build(self, input_shape):
- self.kernel = self.add_weight(shape=(input_shape[-1], self.units),
- initializer='uniform',
- name='kernel')
- self.recurrent_kernel = self.add_weight(
- shape=(self.units, self.units),
- initializer='uniform',
- name='recurrent_kernel')
- self.built = True
-
- def call(self, inputs, states):
- prev_output = states[0]
- h = keras.backend.dot(inputs, self.kernel)
- output = h + keras.backend.dot(prev_output, self.recurrent_kernel)
- return output, [output]
-
- def get_config(self):
- config = {'units': self.units}
- base_config = super(MinimalRNNCell, self).get_config()
- return dict(list(base_config.items()) + list(config.items()))
-
- with self.test_session():
- # Test basic case.
- x = keras.Input((None, 5))
- cell = MinimalRNNCell(32)
- layer = keras.layers.RNN(cell)
- y = layer(x)
- model = keras.models.Model(x, y)
- model.compile(optimizer='rmsprop', loss='mse')
- model.train_on_batch(np.zeros((6, 5, 5)), np.zeros((6, 32)))
-
- # Test basic case serialization.
- x_np = np.random.random((6, 5, 5))
- y_np = model.predict(x_np)
- weights = model.get_weights()
- config = layer.get_config()
- with keras.utils.CustomObjectScope({'MinimalRNNCell': MinimalRNNCell}):
- layer = keras.layers.RNN.from_config(config)
- y = layer(x)
- model = keras.models.Model(x, y)
- model.set_weights(weights)
- y_np_2 = model.predict(x_np)
- self.assertAllClose(y_np, y_np_2, atol=1e-4)
-
- # Test stacking.
- cells = [MinimalRNNCell(8),
- MinimalRNNCell(12),
- MinimalRNNCell(32)]
- layer = keras.layers.RNN(cells)
- y = layer(x)
- model = keras.models.Model(x, y)
- model.compile(optimizer='rmsprop', loss='mse')
- model.train_on_batch(np.zeros((6, 5, 5)), np.zeros((6, 32)))
-
- # Test stacked RNN serialization.
- x_np = np.random.random((6, 5, 5))
- y_np = model.predict(x_np)
- weights = model.get_weights()
- config = layer.get_config()
- with keras.utils.CustomObjectScope({'MinimalRNNCell': MinimalRNNCell}):
- layer = keras.layers.RNN.from_config(config)
- y = layer(x)
- model = keras.models.Model(x, y)
- model.set_weights(weights)
- y_np_2 = model.predict(x_np)
- self.assertAllClose(y_np, y_np_2, atol=1e-4)
-
- def test_rnn_cell_with_constants_layer(self):
-
- class RNNCellWithConstants(keras.layers.Layer):
-
- def __init__(self, units, **kwargs):
- self.units = units
- self.state_size = units
- super(RNNCellWithConstants, self).__init__(**kwargs)
-
- def build(self, input_shape):
- if not isinstance(input_shape, list):
- raise TypeError('expects constants shape')
- [input_shape, constant_shape] = input_shape
- # will (and should) raise if more than one constant passed
-
- self.input_kernel = self.add_weight(
- shape=(input_shape[-1], self.units),
- initializer='uniform',
- name='kernel')
- self.recurrent_kernel = self.add_weight(
- shape=(self.units, self.units),
- initializer='uniform',
- name='recurrent_kernel')
- self.constant_kernel = self.add_weight(
- shape=(constant_shape[-1], self.units),
- initializer='uniform',
- name='constant_kernel')
- self.built = True
-
- def call(self, inputs, states, constants):
- [prev_output] = states
- [constant] = constants
- h_input = keras.backend.dot(inputs, self.input_kernel)
- h_state = keras.backend.dot(prev_output, self.recurrent_kernel)
- h_const = keras.backend.dot(constant, self.constant_kernel)
- output = h_input + h_state + h_const
- return output, [output]
-
- def get_config(self):
- config = {'units': self.units}
- base_config = super(RNNCellWithConstants, self).get_config()
- return dict(list(base_config.items()) + list(config.items()))
-
- with self.test_session():
- # Test basic case.
- x = keras.Input((None, 5))
- c = keras.Input((3,))
- cell = RNNCellWithConstants(32)
- layer = keras.layers.RNN(cell)
- y = layer(x, constants=c)
- model = keras.models.Model([x, c], y)
- model.compile(optimizer='rmsprop', loss='mse')
- model.train_on_batch(
- [np.zeros((6, 5, 5)), np.zeros((6, 3))],
- np.zeros((6, 32))
- )
-
- with self.test_session():
- # Test basic case serialization.
- x_np = np.random.random((6, 5, 5))
- c_np = np.random.random((6, 3))
- y_np = model.predict([x_np, c_np])
- weights = model.get_weights()
- config = layer.get_config()
- custom_objects = {'RNNCellWithConstants': RNNCellWithConstants}
- with keras.utils.CustomObjectScope(custom_objects):
- layer = keras.layers.RNN.from_config(config.copy())
- y = layer(x, constants=c)
- model = keras.models.Model([x, c], y)
- model.set_weights(weights)
- y_np_2 = model.predict([x_np, c_np])
- self.assertAllClose(y_np, y_np_2, atol=1e-4)
-
- with self.test_session():
- # test flat list inputs
- with keras.utils.CustomObjectScope(custom_objects):
- layer = keras.layers.RNN.from_config(config.copy())
- y = layer([x, c])
- model = keras.models.Model([x, c], y)
- model.set_weights(weights)
- y_np_3 = model.predict([x_np, c_np])
- self.assertAllClose(y_np, y_np_3, atol=1e-4)
-
- def test_rnn_cell_with_constants_layer_passing_initial_state(self):
-
- class RNNCellWithConstants(keras.layers.Layer):
-
- def __init__(self, units, **kwargs):
- self.units = units
- self.state_size = units
- super(RNNCellWithConstants, self).__init__(**kwargs)
-
- def build(self, input_shape):
- if not isinstance(input_shape, list):
- raise TypeError('expects constants shape')
- [input_shape, constant_shape] = input_shape
- # will (and should) raise if more than one constant passed
-
- self.input_kernel = self.add_weight(
- shape=(input_shape[-1], self.units),
- initializer='uniform',
- name='kernel')
- self.recurrent_kernel = self.add_weight(
- shape=(self.units, self.units),
- initializer='uniform',
- name='recurrent_kernel')
- self.constant_kernel = self.add_weight(
- shape=(constant_shape[-1], self.units),
- initializer='uniform',
- name='constant_kernel')
- self.built = True
-
- def call(self, inputs, states, constants):
- [prev_output] = states
- [constant] = constants
- h_input = keras.backend.dot(inputs, self.input_kernel)
- h_state = keras.backend.dot(prev_output, self.recurrent_kernel)
- h_const = keras.backend.dot(constant, self.constant_kernel)
- output = h_input + h_state + h_const
- return output, [output]
-
- def get_config(self):
- config = {'units': self.units}
- base_config = super(RNNCellWithConstants, self).get_config()
- return dict(list(base_config.items()) + list(config.items()))
-
- with self.test_session():
- # Test basic case.
- x = keras.Input((None, 5))
- c = keras.Input((3,))
- s = keras.Input((32,))
- cell = RNNCellWithConstants(32)
- layer = keras.layers.RNN(cell)
- y = layer(x, initial_state=s, constants=c)
- model = keras.models.Model([x, s, c], y)
- model.compile(optimizer='rmsprop', loss='mse')
- model.train_on_batch(
- [np.zeros((6, 5, 5)), np.zeros((6, 32)), np.zeros((6, 3))],
- np.zeros((6, 32))
- )
-
- with self.test_session():
- # Test basic case serialization.
- x_np = np.random.random((6, 5, 5))
- s_np = np.random.random((6, 32))
- c_np = np.random.random((6, 3))
- y_np = model.predict([x_np, s_np, c_np])
- weights = model.get_weights()
- config = layer.get_config()
- custom_objects = {'RNNCellWithConstants': RNNCellWithConstants}
- with keras.utils.CustomObjectScope(custom_objects):
- layer = keras.layers.RNN.from_config(config.copy())
- y = layer(x, initial_state=s, constants=c)
- model = keras.models.Model([x, s, c], y)
- model.set_weights(weights)
- y_np_2 = model.predict([x_np, s_np, c_np])
- self.assertAllClose(y_np, y_np_2, atol=1e-4)
-
- # verify that state is used
- y_np_2_different_s = model.predict([x_np, s_np + 10., c_np])
- with self.assertRaises(AssertionError):
- self.assertAllClose(y_np, y_np_2_different_s, atol=1e-4)
-
- with self.test_session():
- # test flat list inputs
- with keras.utils.CustomObjectScope(custom_objects):
- layer = keras.layers.RNN.from_config(config.copy())
- y = layer([x, s, c])
- model = keras.models.Model([x, s, c], y)
- model.set_weights(weights)
- y_np_3 = model.predict([x_np, s_np, c_np])
- self.assertAllClose(y_np, y_np_3, atol=1e-4)
-
- def test_stacked_rnn_attributes(self):
- cells = [keras.layers.LSTMCell(3),
- keras.layers.LSTMCell(3, kernel_regularizer='l2')]
- layer = keras.layers.RNN(cells)
- layer.build((None, None, 5))
-
- # Test regularization losses
- assert len(layer.losses) == 1
-
- # Test weights
- assert len(layer.trainable_weights) == 6
- cells[0].trainable = False
- assert len(layer.trainable_weights) == 3
- assert len(layer.non_trainable_weights) == 3
-
- # Test `get_losses_for`
- x = keras.Input((None, 5))
- y = keras.backend.sum(x)
- cells[0].add_loss(y, inputs=x)
- assert layer.get_losses_for(x) == [y]
-
-
-if __name__ == '__main__':
- test.main()
diff --git a/tensorflow/python/keras/_impl/keras/layers/simplernn_test.py b/tensorflow/python/keras/_impl/keras/layers/simplernn_test.py
index 7edebdacd0..9833485236 100644
--- a/tensorflow/python/keras/_impl/keras/layers/simplernn_test.py
+++ b/tensorflow/python/keras/_impl/keras/layers/simplernn_test.py
@@ -156,10 +156,8 @@ class SimpleRNNLayerTest(test.TestCase):
activity_regularizer='l1')
layer.build((None, None, 2))
self.assertEqual(len(layer.losses), 3)
-
- x = keras.backend.variable(np.ones((2, 3, 2)))
- layer(x)
- self.assertEqual(len(layer.get_losses_for(x)), 1)
+ layer(keras.backend.variable(np.ones((2, 3, 2))))
+ self.assertEqual(len(layer.losses), 4)
def test_constraints_SimpleRNN(self):
embedding_dim = 4
@@ -177,9 +175,9 @@ class SimpleRNNLayerTest(test.TestCase):
recurrent_constraint=r_constraint,
bias_constraint=b_constraint)
layer.build((None, None, embedding_dim))
- self.assertEqual(layer.cell.kernel.constraint, k_constraint)
- self.assertEqual(layer.cell.recurrent_kernel.constraint, r_constraint)
- self.assertEqual(layer.cell.bias.constraint, b_constraint)
+ self.assertEqual(layer.kernel.constraint, k_constraint)
+ self.assertEqual(layer.recurrent_kernel.constraint, r_constraint)
+ self.assertEqual(layer.bias.constraint, b_constraint)
def test_with_masking_layer_SimpleRNN(self):
layer_class = keras.layers.SimpleRNN
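The simplernn_test hunks above revert the loss-count bookkeeping: the three weight regularizers (kernel, recurrent, bias) each register one loss at build time, and the activity regularizer adds one more loss per call, so `len(layer.losses)` goes from 3 to 4 after a single invocation. A minimal sketch of that accounting, assuming the public tf.keras API rather than the `_impl` path used in these tests:

    import numpy as np
    import tensorflow as tf

    layer = tf.keras.layers.SimpleRNN(
        units=3,
        kernel_regularizer='l1',
        recurrent_regularizer='l1',
        bias_regularizer='l1',
        activity_regularizer='l1')
    layer.build((None, None, 2))
    assert len(layer.losses) == 3  # one loss per weight regularizer

    layer(tf.constant(np.ones((2, 3, 2), dtype=np.float32)))
    assert len(layer.losses) == 4  # the activity regularizer adds one per call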
diff --git a/tensorflow/python/keras/layers/__init__.py b/tensorflow/python/keras/layers/__init__.py
index b94bf8f0f6..acf0a5e179 100644
--- a/tensorflow/python/keras/layers/__init__.py
+++ b/tensorflow/python/keras/layers/__init__.py
@@ -134,11 +134,6 @@ from tensorflow.python.keras._impl.keras.layers.pooling import GlobalMaxPool2D
from tensorflow.python.keras._impl.keras.layers.pooling import GlobalMaxPool3D
# Recurrent layers.
-from tensorflow.python.keras._impl.keras.layers.recurrent import RNN
-from tensorflow.python.keras._impl.keras.layers.recurrent import StackedRNNCells
-from tensorflow.python.keras._impl.keras.layers.recurrent import SimpleRNNCell
-from tensorflow.python.keras._impl.keras.layers.recurrent import GRUCell
-from tensorflow.python.keras._impl.keras.layers.recurrent import LSTMCell
from tensorflow.python.keras._impl.keras.layers.recurrent import SimpleRNN
from tensorflow.python.keras._impl.keras.layers.recurrent import GRU
from tensorflow.python.keras._impl.keras.layers.recurrent import LSTM
diff --git a/tensorflow/python/kernel_tests/BUILD b/tensorflow/python/kernel_tests/BUILD
index 8d6f863a4c..7fa504e85e 100644
--- a/tensorflow/python/kernel_tests/BUILD
+++ b/tensorflow/python/kernel_tests/BUILD
@@ -1186,7 +1186,6 @@ cuda_py_test(
srcs = ["check_ops_test.py"],
additional_deps = [
"//third_party/py/numpy",
- "//tensorflow/python/eager:context",
"//tensorflow/python:array_ops",
"//tensorflow/python:check_ops",
"//tensorflow/python:client_testlib",
diff --git a/tensorflow/python/kernel_tests/array_ops_test.py b/tensorflow/python/kernel_tests/array_ops_test.py
index 6eb9c66d06..76b80e60ea 100644
--- a/tensorflow/python/kernel_tests/array_ops_test.py
+++ b/tensorflow/python/kernel_tests/array_ops_test.py
@@ -107,22 +107,41 @@ class BooleanMaskTest(test_util.TensorFlowTestCase):
def setUp(self):
self.rng = np.random.RandomState(42)
- def CheckVersusNumpy(self, ndims_mask, arr_shape, make_mask=None):
+ def CheckVersusNumpy(self, ndims_mask, arr_shape, make_mask=None, axis=None):
"""Check equivalence between boolean_mask and numpy masking."""
if make_mask is None:
make_mask = lambda shape: self.rng.randint(0, 2, size=shape).astype(bool)
arr = np.random.rand(*arr_shape)
mask = make_mask(arr_shape[:ndims_mask])
- masked_arr = arr[mask]
- with self.test_session():
- masked_tensor = array_ops.boolean_mask(arr, mask)
+ if axis is not None:
+ mask = make_mask(arr_shape[axis:ndims_mask+axis])
+ if axis is None or axis == 0:
+ masked_arr = arr[mask]
+ elif axis == 1:
+ masked_arr = arr[:,mask]
+ elif axis == 2:
+ masked_arr = arr[:,:,mask]
+ with self.test_session() as sess:
+ masked_tensor = array_ops.boolean_mask(arr, mask, axis=axis)
# Leading dimension size of masked_tensor is always unknown until runtime
# since we don't know how many elements will be kept.
- self.assertAllEqual(masked_tensor.get_shape()[1:], masked_arr.shape[1:])
+ leading = 1 if axis is None else axis + 1
+ self.assertAllEqual(masked_tensor.get_shape()[leading:],
+ masked_arr.shape[leading:])
self.assertAllClose(masked_arr, masked_tensor.eval())
+ def testMaskDim1ArrDim2Axis1(self):
+ ndims_mask = 1
+ for arr_shape in [(1, 1), (2, 2), (2, 5)]:
+ self.CheckVersusNumpy(ndims_mask, arr_shape, axis=1)
+
+ def testMaskDim2ArrDim2Axis1(self):
+ ndims_mask = 2
+ for arr_shape in [(1, 1), (2, 2), (2, 5)]:
+ self.CheckVersusNumpy(ndims_mask, arr_shape, axis=1)
+
def testMaskDim1ArrDim1(self):
ndims_mask = 1
for arr_shape in [(1,), (2,), (3,), (10,)]:
@@ -486,7 +505,7 @@ class StridedSliceTest(test_util.TensorFlowTestCase):
_ = checker2[...]
_ = checker2[tuple()]
- def testFloatSlicedArrayAndInt64IndicesGPU(self):
+ def testInt64GPU(self):
if not test_util.is_gpu_available():
self.skipTest("No GPU available")
with self.test_session(use_gpu=True, force_gpu=True):
@@ -497,17 +516,6 @@ class StridedSliceTest(test_util.TensorFlowTestCase):
s = array_ops.strided_slice(x, begin, end, strides)
self.assertAllEqual([3.], self.evaluate(s))
- def testInt64SlicedArrayAndIndicesGPU(self):
- if not test_util.is_gpu_available():
- self.skipTest("No GPU available")
- with self.test_session(use_gpu=True, force_gpu=True):
- x = constant_op.constant([1, 2, 3], dtype=dtypes.int64)
- begin = constant_op.constant([2], dtype=dtypes.int64)
- end = constant_op.constant([3], dtype=dtypes.int64)
- strides = constant_op.constant([1], dtype=dtypes.int64)
- s = array_ops.strided_slice(x, begin, end, strides)
- self.assertAllEqual([3], self.evaluate(s))
-
def testDegenerateSlices(self):
with self.test_session(use_gpu=True):
checker = StridedSliceChecker(self, StridedSliceChecker.REF_TENSOR)
@@ -1070,6 +1078,16 @@ class PadTest(test_util.TensorFlowTestCase):
[0, 0, 4, 5, 6, 0, 0],
[0, 0, 0, 0, 0, 0, 0]])
+class InvertPermutationTest(test_util.TensorFlowTestCase):
+
+ def testInvertPermutation(self):
+ for dtype in [dtypes.int32, dtypes.int64]:
+ with self.test_session(use_gpu=True):
+ x = constant_op.constant([3, 4, 0, 2, 1], dtype=dtype)
+ y = array_ops.invert_permutation(x)
+ self.assertAllEqual(y.get_shape(), [5])
+ self.assertAllEqual(y.eval(), [2, 4, 3, 0, 1])
+
if __name__ == "__main__":
test_lib.main()
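The boolean_mask hunk introduces an `axis` argument: the mask consumes `mask.ndim` dimensions of the array starting at `axis`, which is exactly what the `arr[:, mask]` and `arr[:, :, mask]` branches in `CheckVersusNumpy` spell out. The new `InvertPermutationTest` pins down `y[x[i]] = i`. Both behaviors restated as a NumPy-only reference (a sketch, not the kernel implementation):

    import numpy as np

    def boolean_mask_reference(arr, mask, axis=0):
        # The mask covers dims [axis, axis + mask.ndim); kept entries are
        # flattened into a single dimension at position `axis`.
        return arr[(slice(None),) * axis + (mask,)]

    def invert_permutation_reference(perm):
        inv = np.empty_like(perm)
        inv[perm] = np.arange(len(perm))  # y[perm[i]] = i
        return inv

    assert list(invert_permutation_reference(np.array([3, 4, 0, 2, 1]))) == [2, 4, 3, 0, 1]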
diff --git a/tensorflow/python/kernel_tests/bincount_op_test.py b/tensorflow/python/kernel_tests/bincount_op_test.py
index 7a610debd1..79285476b4 100644
--- a/tensorflow/python/kernel_tests/bincount_op_test.py
+++ b/tensorflow/python/kernel_tests/bincount_op_test.py
@@ -25,11 +25,10 @@ from tensorflow.python.framework import test_util
from tensorflow.python.ops import math_ops
from tensorflow.python.platform import googletest
-
class BincountTest(test_util.TensorFlowTestCase):
def test_empty(self):
- with self.test_session():
+ with self.test_session(use_gpu=True):
self.assertAllEqual(
math_ops.bincount([], minlength=5).eval(), [0, 0, 0, 0, 0])
self.assertAllEqual(math_ops.bincount([], minlength=1).eval(), [0])
@@ -42,7 +41,7 @@ class BincountTest(test_util.TensorFlowTestCase):
np.float64)
def test_values(self):
- with self.test_session():
+ with self.test_session(use_gpu=True):
self.assertAllEqual(
math_ops.bincount([1, 1, 1, 2, 2, 3]).eval(), [0, 3, 2, 1])
arr = [1, 1, 2, 1, 2, 3, 1, 2, 3, 4, 1, 2, 3, 4, 5]
@@ -57,14 +56,14 @@ class BincountTest(test_util.TensorFlowTestCase):
math_ops.bincount(np.arange(10000)).eval(), np.ones(10000))
def test_maxlength(self):
- with self.test_session():
+ with self.test_session(use_gpu=True):
self.assertAllEqual(math_ops.bincount([5], maxlength=3).eval(), [0, 0, 0])
self.assertAllEqual(math_ops.bincount([1], maxlength=3).eval(), [0, 1])
self.assertAllEqual(math_ops.bincount([], maxlength=3).eval(), [])
def test_random_with_weights(self):
num_samples = 10000
- with self.test_session():
+ with self.test_session(use_gpu=True):
np.random.seed(42)
for dtype in [dtypes.int32, dtypes.int64, dtypes.float32, dtypes.float64]:
arr = np.random.randint(0, 1000, num_samples)
@@ -72,17 +71,29 @@ class BincountTest(test_util.TensorFlowTestCase):
weights = np.random.randint(-100, 100, num_samples)
else:
weights = np.random.random(num_samples)
- self.assertAllEqual(
+ self.assertAllClose(
math_ops.bincount(arr, weights).eval(),
np.bincount(arr, weights))
+ def test_random_without_weights(self):
+ num_samples = 10000
+ with self.test_session(use_gpu=True):
+ np.random.seed(42)
+ for dtype in [np.int32, np.float32]:
+ arr = np.random.randint(0, 1000, num_samples)
+ weights = np.ones(num_samples).astype(dtype)
+ self.assertAllClose(
+ math_ops.bincount(arr, None).eval(),
+ np.bincount(arr, weights))
+
def test_zero_weights(self):
- with self.test_session():
+ with self.test_session(use_gpu=True):
self.assertAllEqual(
math_ops.bincount(np.arange(1000), np.zeros(1000)).eval(),
np.zeros(1000))
def test_negative(self):
+ # unsorted_segment_sum will only report InvalidArgumentError on CPU
with self.test_session():
with self.assertRaises(errors.InvalidArgumentError):
math_ops.bincount([1, 2, 3, -1, 6, 8]).eval()
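These bincount tests track NumPy semantics: `minlength` pads the output, `maxlength` truncates it, and the weighted checks switch from assertAllEqual to assertAllClose because the GPU path may accumulate float weights in a different order. A NumPy sketch of the expected behavior (illustrative, not the op's implementation):

    import numpy as np

    def bincount_reference(arr, weights=None, minlength=None, maxlength=None):
        arr = np.asarray(arr, dtype=np.int64)
        out = np.bincount(arr, weights=weights,
                          minlength=0 if minlength is None else minlength)
        if maxlength is not None:
            out = out[:maxlength]  # bincount([5], maxlength=3) -> [0, 0, 0]
        return out

    assert list(bincount_reference([1, 1, 1, 2, 2, 3])) == [0, 3, 2, 1]
    assert list(bincount_reference([5], maxlength=3)) == [0, 0, 0]
    assert list(bincount_reference([], minlength=5)) == [0, 0, 0, 0, 0]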
diff --git a/tensorflow/python/kernel_tests/bucketize_op_test.py b/tensorflow/python/kernel_tests/bucketize_op_test.py
index 6db3592055..e612b1c134 100644
--- a/tensorflow/python/kernel_tests/bucketize_op_test.py
+++ b/tensorflow/python/kernel_tests/bucketize_op_test.py
@@ -31,7 +31,7 @@ class BucketizationOpTest(test.TestCase):
constant_op.constant([-5, 0, 2, 3, 5, 8, 10, 11, 12]),
boundaries=[0, 3, 8, 11])
expected_out = [0, 1, 1, 2, 2, 3, 3, 4, 4]
- with self.test_session() as sess:
+ with self.test_session(use_gpu=True) as sess:
self.assertAllEqual(expected_out, sess.run(op))
def testFloat(self):
@@ -39,7 +39,7 @@ class BucketizationOpTest(test.TestCase):
constant_op.constant([-5., 0., 2., 3., 5., 8., 10., 11., 12.]),
boundaries=[0., 3., 8., 11.])
expected_out = [0, 1, 1, 2, 2, 3, 3, 4, 4]
- with self.test_session() as sess:
+ with self.test_session(use_gpu=True) as sess:
self.assertAllEqual(expected_out, sess.run(op))
def test2DInput(self):
@@ -47,13 +47,13 @@ class BucketizationOpTest(test.TestCase):
constant_op.constant([[-5, 0, 2, 3, 5], [8, 10, 11, 12, 0]]),
boundaries=[0, 3, 8, 11])
expected_out = [[0, 1, 1, 2, 2], [3, 3, 4, 4, 1]]
- with self.test_session() as sess:
+ with self.test_session(use_gpu=True) as sess:
self.assertAllEqual(expected_out, sess.run(op))
def testInvalidBoundariesOrder(self):
op = math_ops._bucketize(
constant_op.constant([-5, 0]), boundaries=[0, 8, 3, 11])
- with self.test_session() as sess:
+ with self.test_session(use_gpu=True) as sess:
with self.assertRaisesRegexp(
errors_impl.InvalidArgumentError, "Expected sorted boundaries"):
sess.run(op)
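`_bucketize` assigns each value a bucket index equal to the number of boundaries less than or equal to it, i.e. `bisect_right` over the sorted boundary list; unsorted boundaries are rejected at run time, as the last hunk checks. A pure-Python sketch of those semantics:

    import bisect

    def bucketize_reference(values, boundaries):
        if sorted(boundaries) != list(boundaries):
            raise ValueError("Expected sorted boundaries")
        # Bucket i covers [boundaries[i-1], boundaries[i]); values >= the
        # last boundary land in bucket len(boundaries).
        return [bisect.bisect_right(boundaries, v) for v in values]

    assert bucketize_reference([-5, 0, 2, 3, 5, 8, 10, 11, 12],
                               [0, 3, 8, 11]) == [0, 1, 1, 2, 2, 3, 3, 4, 4]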
diff --git a/tensorflow/python/kernel_tests/check_ops_test.py b/tensorflow/python/kernel_tests/check_ops_test.py
index 43785adcee..ed859e3774 100644
--- a/tensorflow/python/kernel_tests/check_ops_test.py
+++ b/tensorflow/python/kernel_tests/check_ops_test.py
@@ -20,13 +20,10 @@ from __future__ import print_function
import numpy as np
-from tensorflow.python.eager import context
from tensorflow.python.framework import constant_op
from tensorflow.python.framework import dtypes
-from tensorflow.python.framework import errors
from tensorflow.python.framework import ops
from tensorflow.python.framework import sparse_tensor
-from tensorflow.python.framework import test_util
from tensorflow.python.ops import array_ops
from tensorflow.python.ops import check_ops
from tensorflow.python.platform import test
@@ -74,178 +71,110 @@ class AssertProperIterableTest(test.TestCase):
class AssertEqualTest(test.TestCase):
- @test_util.run_in_graph_and_eager_modes()
def test_doesnt_raise_when_equal(self):
- small = constant_op.constant([1, 2], name="small")
- with ops.control_dependencies([check_ops.assert_equal(small, small)]):
- out = array_ops.identity(small)
- self.evaluate(out)
-
- def test_returns_none_with_eager(self):
- with context.eager_mode():
+ with self.test_session():
small = constant_op.constant([1, 2], name="small")
- x = check_ops.assert_equal(small, small)
- assert x is None
+ with ops.control_dependencies([check_ops.assert_equal(small, small)]):
+ out = array_ops.identity(small)
+ out.eval()
- @test_util.run_in_graph_and_eager_modes()
def test_raises_when_greater(self):
- # Static check
- static_small = constant_op.constant([1, 2], name="small")
- static_big = constant_op.constant([3, 4], name="big")
- with self.assertRaisesRegexp(errors.InvalidArgumentError, "fail"):
- check_ops.assert_equal(static_big, static_small, message="fail")
-
- # Dynamic check
- if context.in_graph_mode():
- with self.test_session():
- small = array_ops.placeholder(dtypes.int32, name="small")
- big = array_ops.placeholder(dtypes.int32, name="big")
- with ops.control_dependencies(
- [check_ops.assert_equal(
- big, small, message="fail")]):
- out = array_ops.identity(small)
- with self.assertRaisesOpError("fail.*big.*small"):
- out.eval(feed_dict={small: [1, 2], big: [3, 4]})
-
- def test_error_message_eager(self):
- expected_error_msg_full = r"""big does not equal small
-Condition x == y did not hold.
-Indices of first 6 different values:
-\[\[0 0\]
- \[1 1\]
- \[2 0\]\]
-Corresponding x values:
-\[2 3 6\]
-Corresponding y values:
-\[20 30 60\]
-First 6 elements of x:
-\[2 2 3 3 6 6\]
-First 6 elements of y:
-\[20 2 3 30 60 6\]
-"""
- expected_error_msg_short = r"""big does not equal small
-Condition x == y did not hold.
-Indices of first 2 different values:
-\[\[0 0\]
- \[1 1\]\]
-Corresponding x values:
-\[2 3\]
-Corresponding y values:
-\[20 30\]
-First 2 elements of x:
-\[2 2\]
-First 2 elements of y:
-\[20 2\]
-"""
- with context.eager_mode():
- big = constant_op.constant([[2, 2], [3, 3], [6, 6]])
- small = constant_op.constant([[20, 2], [3, 30], [60, 6]])
- with self.assertRaisesRegexp(errors.InvalidArgumentError,
- expected_error_msg_full):
- check_ops.assert_equal(big, small, message="big does not equal small",
- summarize=10)
- with self.assertRaisesRegexp(errors.InvalidArgumentError,
- expected_error_msg_short):
- check_ops.assert_equal(big, small, message="big does not equal small",
- summarize=2)
-
- @test_util.run_in_graph_and_eager_modes()
+ with self.test_session():
+ # Static check
+ static_small = constant_op.constant([1, 2], name="small")
+ static_big = constant_op.constant([3, 4], name="big")
+ with self.assertRaisesRegexp(ValueError, "fail"):
+ check_ops.assert_equal(static_big, static_small, message="fail")
+ # Dynamic check
+ small = array_ops.placeholder(dtypes.int32, name="small")
+ big = array_ops.placeholder(dtypes.int32, name="big")
+ with ops.control_dependencies(
+ [check_ops.assert_equal(
+ big, small, message="fail")]):
+ out = array_ops.identity(small)
+ with self.assertRaisesOpError("fail.*big.*small"):
+ out.eval(feed_dict={small: [1, 2], big: [3, 4]})
+
def test_raises_when_less(self):
- # Static check
- static_small = constant_op.constant([3, 1], name="small")
- static_big = constant_op.constant([4, 2], name="big")
- with self.assertRaisesRegexp(errors.InvalidArgumentError, "fail"):
- check_ops.assert_equal(static_big, static_small, message="fail")
-
- # Dynamic check
- if context.in_graph_mode():
- with self.test_session():
- small = array_ops.placeholder(dtypes.int32, name="small")
- big = array_ops.placeholder(dtypes.int32, name="big")
- with ops.control_dependencies([check_ops.assert_equal(small, big)]):
- out = array_ops.identity(small)
- with self.assertRaisesOpError("small.*big"):
- out.eval(feed_dict={small: [3, 1], big: [4, 2]})
+ with self.test_session():
+ # Static check
+ static_small = constant_op.constant([3, 1], name="small")
+ static_big = constant_op.constant([4, 2], name="big")
+ with self.assertRaisesRegexp(ValueError, "fail"):
+ check_ops.assert_equal(static_big, static_small, message="fail")
+ # Dynamic check
+ small = array_ops.placeholder(dtypes.int32, name="small")
+ big = array_ops.placeholder(dtypes.int32, name="big")
+ with ops.control_dependencies([check_ops.assert_equal(small, big)]):
+ out = array_ops.identity(small)
+ with self.assertRaisesOpError("small.*big"):
+ out.eval(feed_dict={small: [3, 1], big: [4, 2]})
- @test_util.run_in_graph_and_eager_modes()
def test_doesnt_raise_when_equal_and_broadcastable_shapes(self):
- small = constant_op.constant([[1, 2], [1, 2]], name="small")
- small_2 = constant_op.constant([1, 2], name="small_2")
- with ops.control_dependencies([check_ops.assert_equal(small, small_2)]):
- out = array_ops.identity(small)
- self.evaluate(out)
-
- @test_util.run_in_graph_and_eager_modes()
- def test_raises_when_equal_but_non_broadcastable_shapes(self):
- small = constant_op.constant([1, 1, 1], name="small")
- small_2 = constant_op.constant([1, 1], name="small_2")
- # The exception in eager and non-eager mode is different because
- # eager mode relies on shape check done as part of the C++ op, while
- # graph mode does shape checks when creating the `Operation` instance.
- with self.assertRaisesRegexp(
- (errors.InvalidArgumentError, ValueError),
- (r"Incompatible shapes: \[3\] vs. \[2\]|"
- r"Dimensions must be equal, but are 3 and 2")):
+ with self.test_session():
+ small = constant_op.constant([1, 2], name="small")
+ small_2 = constant_op.constant([1, 2], name="small_2")
with ops.control_dependencies([check_ops.assert_equal(small, small_2)]):
out = array_ops.identity(small)
- self.evaluate(out)
+ out.eval()
+
+ def test_raises_when_equal_but_non_broadcastable_shapes(self):
+ with self.test_session():
+ small = constant_op.constant([1, 1, 1], name="small")
+ small_2 = constant_op.constant([1, 1], name="small_2")
+ with self.assertRaisesRegexp(ValueError, "must be"):
+ with ops.control_dependencies([check_ops.assert_equal(small, small_2)]):
+ out = array_ops.identity(small)
+ out.eval()
- @test_util.run_in_graph_and_eager_modes()
def test_doesnt_raise_when_both_empty(self):
- larry = constant_op.constant([])
- curly = constant_op.constant([])
- with ops.control_dependencies([check_ops.assert_equal(larry, curly)]):
- out = array_ops.identity(larry)
- self.evaluate(out)
+ with self.test_session():
+ larry = constant_op.constant([])
+ curly = constant_op.constant([])
+ with ops.control_dependencies([check_ops.assert_equal(larry, curly)]):
+ out = array_ops.identity(larry)
+ out.eval()
class AssertNoneEqualTest(test.TestCase):
- @test_util.run_in_graph_and_eager_modes()
def test_doesnt_raise_when_not_equal(self):
- small = constant_op.constant([1, 2], name="small")
- big = constant_op.constant([10, 20], name="small")
- with ops.control_dependencies(
- [check_ops.assert_none_equal(big, small)]):
- out = array_ops.identity(small)
- self.evaluate(out)
-
- @test_util.run_in_graph_and_eager_modes()
+ with self.test_session():
+ small = constant_op.constant([1, 2], name="small")
+ big = constant_op.constant([10, 20], name="small")
+ with ops.control_dependencies(
+ [check_ops.assert_none_equal(big, small)]):
+ out = array_ops.identity(small)
+ out.eval()
+
def test_raises_when_equal(self):
- small = constant_op.constant([3, 1], name="small")
- with self.assertRaisesOpError("x != y did not hold"):
+ with self.test_session():
+ small = constant_op.constant([3, 1], name="small")
with ops.control_dependencies(
[check_ops.assert_none_equal(small, small)]):
out = array_ops.identity(small)
- self.evaluate(out)
+ with self.assertRaisesOpError("x != y did not hold"):
+ out.eval()
- @test_util.run_in_graph_and_eager_modes()
def test_doesnt_raise_when_not_equal_and_broadcastable_shapes(self):
- small = constant_op.constant([1, 2], name="small")
- big = constant_op.constant([3], name="big")
- with ops.control_dependencies(
- [check_ops.assert_none_equal(small, big)]):
- out = array_ops.identity(small)
- self.evaluate(out)
-
- @test_util.run_in_graph_and_eager_modes()
+ with self.test_session():
+ small = constant_op.constant([1, 2], name="small")
+ big = constant_op.constant([3], name="big")
+ with ops.control_dependencies(
+ [check_ops.assert_none_equal(small, big)]):
+ out = array_ops.identity(small)
+ out.eval()
+
def test_raises_when_not_equal_but_non_broadcastable_shapes(self):
with self.test_session():
small = constant_op.constant([1, 1, 1], name="small")
big = constant_op.constant([10, 10], name="big")
- # The exception in eager and non-eager mode is different because
- # eager mode relies on shape check done as part of the C++ op, while
- # graph mode does shape checks when creating the `Operation` instance.
- with self.assertRaisesRegexp(
- (ValueError, errors.InvalidArgumentError),
- (r"Incompatible shapes: \[3\] vs. \[2\]|"
- r"Dimensions must be equal, but are 3 and 2")):
+ with self.assertRaisesRegexp(ValueError, "must be"):
with ops.control_dependencies(
[check_ops.assert_none_equal(small, big)]):
out = array_ops.identity(small)
- self.evaluate(out)
+ out.eval()
- @test_util.run_in_graph_and_eager_modes()
def test_doesnt_raise_when_both_empty(self):
with self.test_session():
larry = constant_op.constant([])
@@ -253,82 +182,62 @@ class AssertNoneEqualTest(test.TestCase):
with ops.control_dependencies(
[check_ops.assert_none_equal(larry, curly)]):
out = array_ops.identity(larry)
- self.evaluate(out)
-
- def test_returns_none_with_eager(self):
- with context.eager_mode():
- t1 = constant_op.constant([1, 2])
- t2 = constant_op.constant([3, 4])
- x = check_ops.assert_none_equal(t1, t2)
- assert x is None
+ out.eval()
class AssertLessTest(test.TestCase):
- @test_util.run_in_graph_and_eager_modes()
def test_raises_when_equal(self):
- small = constant_op.constant([1, 2], name="small")
- with self.assertRaisesOpError("failure message.*\n*.* x < y did not hold"):
+ with self.test_session():
+ small = constant_op.constant([1, 2], name="small")
with ops.control_dependencies(
[check_ops.assert_less(
- small, small, message="failure message")]):
+ small, small, message="fail")]):
out = array_ops.identity(small)
- self.evaluate(out)
+ with self.assertRaisesOpError("fail.*small.*small"):
+ out.eval()
- @test_util.run_in_graph_and_eager_modes()
def test_raises_when_greater(self):
- small = constant_op.constant([1, 2], name="small")
- big = constant_op.constant([3, 4], name="big")
- with self.assertRaisesOpError("x < y did not hold"):
+ with self.test_session():
+ small = constant_op.constant([1, 2], name="small")
+ big = constant_op.constant([3, 4], name="big")
with ops.control_dependencies([check_ops.assert_less(big, small)]):
out = array_ops.identity(small)
- self.evaluate(out)
+ with self.assertRaisesOpError("big.*small"):
+ out.eval()
- @test_util.run_in_graph_and_eager_modes()
def test_doesnt_raise_when_less(self):
- small = constant_op.constant([3, 1], name="small")
- big = constant_op.constant([4, 2], name="big")
- with ops.control_dependencies([check_ops.assert_less(small, big)]):
- out = array_ops.identity(small)
- self.evaluate(out)
+ with self.test_session():
+ small = constant_op.constant([3, 1], name="small")
+ big = constant_op.constant([4, 2], name="big")
+ with ops.control_dependencies([check_ops.assert_less(small, big)]):
+ out = array_ops.identity(small)
+ out.eval()
- @test_util.run_in_graph_and_eager_modes()
def test_doesnt_raise_when_less_and_broadcastable_shapes(self):
- small = constant_op.constant([1], name="small")
- big = constant_op.constant([3, 2], name="big")
- with ops.control_dependencies([check_ops.assert_less(small, big)]):
- out = array_ops.identity(small)
- self.evaluate(out)
-
- @test_util.run_in_graph_and_eager_modes()
- def test_raises_when_less_but_non_broadcastable_shapes(self):
- small = constant_op.constant([1, 1, 1], name="small")
- big = constant_op.constant([3, 2], name="big")
- # The exception in eager and non-eager mode is different because
- # eager mode relies on shape check done as part of the C++ op, while
- # graph mode does shape checks when creating the `Operation` instance.
- with self.assertRaisesRegexp(
- (ValueError, errors.InvalidArgumentError),
- (r"Incompatible shapes: \[3\] vs. \[2\]|"
- "Dimensions must be equal, but are 3 and 2")):
+ with self.test_session():
+ small = constant_op.constant([1], name="small")
+ big = constant_op.constant([3, 2], name="big")
with ops.control_dependencies([check_ops.assert_less(small, big)]):
out = array_ops.identity(small)
- self.evaluate(out)
+ out.eval()
+
+ def test_raises_when_less_but_non_broadcastable_shapes(self):
+ with self.test_session():
+ small = constant_op.constant([1, 1, 1], name="small")
+ big = constant_op.constant([3, 2], name="big")
+ with self.assertRaisesRegexp(ValueError, "must be"):
+ with ops.control_dependencies([check_ops.assert_less(small, big)]):
+ out = array_ops.identity(small)
+ out.eval()
- @test_util.run_in_graph_and_eager_modes()
def test_doesnt_raise_when_both_empty(self):
- larry = constant_op.constant([])
- curly = constant_op.constant([])
- with ops.control_dependencies([check_ops.assert_less(larry, curly)]):
- out = array_ops.identity(larry)
- self.evaluate(out)
-
- def test_returns_none_with_eager(self):
- with context.eager_mode():
- t1 = constant_op.constant([1, 2])
- t2 = constant_op.constant([3, 4])
- x = check_ops.assert_less(t1, t2)
- assert x is None
+ with self.test_session():
+ larry = constant_op.constant([])
+ curly = constant_op.constant([])
+ with ops.control_dependencies([check_ops.assert_less(larry, curly)]):
+ out = array_ops.identity(larry)
+ out.eval()
class AssertLessEqualTest(test.TestCase):
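The reverted check_ops tests all share one graph-mode pattern: the assertion op runs only because it is wired in as a control dependency of a tensor the test actually evaluates. A minimal sketch of that pattern in the TF 1.x API these tests use:

    import tensorflow as tf

    small = tf.constant([1, 2], name="small")
    with tf.control_dependencies([tf.assert_equal(small, small)]):
        out = tf.identity(small)  # evaluating `out` forces the assert to run

    with tf.Session() as sess:
        sess.run(out)  # would raise InvalidArgumentError if the assert failed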
diff --git a/tensorflow/python/kernel_tests/constant_op_test.py b/tensorflow/python/kernel_tests/constant_op_test.py
index 6167cb9999..6cbdd4cbb3 100644
--- a/tensorflow/python/kernel_tests/constant_op_test.py
+++ b/tensorflow/python/kernel_tests/constant_op_test.py
@@ -439,9 +439,10 @@ class ZerosLikeTest(test.TestCase):
def testZerosLikeCPU(self):
for dtype in [
- dtypes_lib.float32, dtypes_lib.float64, dtypes_lib.int32,
- dtypes_lib.uint8, dtypes_lib.int16, dtypes_lib.int8,
- dtypes_lib.complex64, dtypes_lib.complex128, dtypes_lib.int64,
+ dtypes_lib.float32, dtypes_lib.float64,
+ dtypes_lib.int8, dtypes_lib.uint8, dtypes_lib.int16, dtypes_lib.uint16,
+ dtypes_lib.int32, dtypes_lib.int64, dtypes_lib.bool,
+ dtypes_lib.complex64, dtypes_lib.complex128,
dtypes_lib.string
]:
self._compareZeros(dtype, fully_defined_shape=False, use_gpu=False)
@@ -573,9 +574,10 @@ class OnesLikeTest(test.TestCase):
def testOnesLike(self):
for dtype in [
- dtypes_lib.float32, dtypes_lib.float64, dtypes_lib.int32,
- dtypes_lib.uint8, dtypes_lib.int16, dtypes_lib.int8,
- dtypes_lib.complex64, dtypes_lib.complex128, dtypes_lib.int64
+ dtypes_lib.float32, dtypes_lib.float64,
+ dtypes_lib.int8, dtypes_lib.uint8, dtypes_lib.int16, dtypes_lib.uint16,
+ dtypes_lib.int32, dtypes_lib.int64, dtypes_lib.bool,
+ dtypes_lib.complex64, dtypes_lib.complex128
]:
numpy_dtype = dtype.as_numpy_dtype
with self.test_session():
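The widened dtype lists extend zeros_like/ones_like coverage to uint16 and bool; the property being asserted mirrors NumPy, roughly:

    import numpy as np

    # Sketch of the property the extended lists check, restated with NumPy.
    for np_dtype in [np.float32, np.float64, np.int8, np.uint8, np.int16,
                     np.uint16, np.int32, np.int64, np.bool_,
                     np.complex64, np.complex128]:
        x = np.ones((2, 3), dtype=np_dtype)
        z = np.zeros_like(x)
        assert z.dtype == x.dtype and not z.any()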
diff --git a/tensorflow/python/kernel_tests/conv1d_test.py b/tensorflow/python/kernel_tests/conv1d_test.py
index 662c94eea7..7c8d309bbd 100644
--- a/tensorflow/python/kernel_tests/conv1d_test.py
+++ b/tensorflow/python/kernel_tests/conv1d_test.py
@@ -17,6 +17,9 @@ from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
+import numpy as np
+from six.moves import xrange # pylint: disable=redefined-builtin
+
from tensorflow.python.framework import constant_op
from tensorflow.python.framework import dtypes
from tensorflow.python.ops import array_ops
@@ -50,5 +53,45 @@ class Conv1DTest(test.TestCase):
self.assertAllClose(output, [2 * 1 + 1 * 2, 2 * 3 + 1 * 4])
+ def testConv1DTranspose(self):
+ with self.test_session():
+ stride = 2
+
+ # Input, output: [batch, width, depth]
+ x_shape = [2, 4, 3]
+ y_shape = [2, 9, 2]
+
+ # Filter: [kernel_width, output_depth, input_depth]
+ f_shape = [3, 2, 3]
+
+ x = constant_op.constant(
+ 1.0, shape=x_shape, name="x", dtype=dtypes.float32)
+ f = constant_op.constant(
+ 1.0, shape=f_shape, name="filter", dtype=dtypes.float32)
+ output = nn_ops.conv1d_transpose(
+ x, f, y_shape, stride=stride, padding="VALID")
+ value = output.eval()
+
+ cache_values = np.zeros(y_shape, dtype=np.float32)
+
+ # The amount of padding added
+ pad = 1
+
+ for n in xrange(x_shape[0]):
+ for k in xrange(f_shape[1]):
+ for w in xrange(pad, y_shape[1] - pad):
+ target = 3.0
+ # Interior locations divisible by the stride receive one extra contribution.
+ w_in = w % stride == 0 and w > pad and w < y_shape[1] - 1 - pad
+ if w_in:
+ target += 3.0
+ cache_values[n, w, k] = target
+
+ # copy values in the border
+ cache_values[n, 0, k] = cache_values[n, 1, k]
+ cache_values[n, -1, k] = cache_values[n, -2, k]
+
+ self.assertAllClose(cache_values, value)
+
if __name__ == "__main__":
test.main()
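The shapes in `testConv1DTranspose` follow the usual transposed-convolution length rule for VALID padding, out_width = (in_width - 1) * stride + kernel_width, which yields (4 - 1) * 2 + 3 = 9 and hence `y_shape = [2, 9, 2]`. A sketch of that rule:

    def conv1d_transpose_out_width(in_width, kernel_width, stride,
                                   padding="VALID"):
        if padding == "VALID":
            # Inputs are spaced `stride` apart; the kernel extends the last
            # one by (kernel_width - 1).
            return (in_width - 1) * stride + kernel_width
        return in_width * stride  # SAME

    assert conv1d_transpose_out_width(4, 3, 2) == 9  # matches y_shape above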
diff --git a/tensorflow/python/kernel_tests/conv_ops_3d_test.py b/tensorflow/python/kernel_tests/conv_ops_3d_test.py
index 14622ab467..116681fc4c 100644
--- a/tensorflow/python/kernel_tests/conv_ops_3d_test.py
+++ b/tensorflow/python/kernel_tests/conv_ops_3d_test.py
@@ -21,6 +21,8 @@ from __future__ import print_function
import collections
import math
+import numpy as np
+
from tensorflow.python.framework import constant_op
from tensorflow.python.framework import dtypes
from tensorflow.python.framework import test_util
@@ -45,8 +47,19 @@ def GetTestConfigs():
class Conv3DTest(test.TestCase):
+ def _DtypesToTest(self, use_gpu):
+ if use_gpu:
+ if not test_util.CudaSupportsHalfMatMulAndConv():
+ return [dtypes.float32]
+ else:
+ # It is important that float32 comes before float16 here,
+ # as we will be using its gradients as the reference for fp16 gradients.
+ return [dtypes.float32, dtypes.float16]
+ else:
+ return [dtypes.float64, dtypes.float32, dtypes.float16]
+
def _SetupValuesForDevice(self, tensor_in_sizes, filter_in_sizes, stride,
- padding, data_format, use_gpu):
+ padding, data_format, dtype, use_gpu):
total_size_1 = 1
total_size_2 = 1
for s in tensor_in_sizes:
@@ -54,13 +67,14 @@ class Conv3DTest(test.TestCase):
for s in filter_in_sizes:
total_size_2 *= s
- # Initializes the input tensor with array containing incrementing
- # numbers from 1.
- x1 = [f * 1.0 for f in range(1, total_size_1 + 1)]
- x2 = [f * 1.0 for f in range(1, total_size_2 + 1)]
+ # Initializes the input tensor with an array of numbers from 0 to 1.
+ # We keep the input tensor values fairly small to avoid overflowing float16
+ # during the conv3d computation.
+ x1 = [f * 1.0 / total_size_1 for f in range(1, total_size_1 + 1)]
+ x2 = [f * 1.0 / total_size_2 for f in range(1, total_size_2 + 1)]
with self.test_session(use_gpu=use_gpu):
- t1 = constant_op.constant(x1, shape=tensor_in_sizes)
- t2 = constant_op.constant(x2, shape=filter_in_sizes)
+ t1 = constant_op.constant(x1, shape=tensor_in_sizes, dtype=dtype)
+ t2 = constant_op.constant(x2, shape=filter_in_sizes, dtype=dtype)
if isinstance(stride, collections.Iterable):
strides = [1] + list(stride) + [1]
@@ -81,27 +95,35 @@ class Conv3DTest(test.TestCase):
expected):
results = []
for data_format, use_gpu in GetTestConfigs():
- result = self._SetupValuesForDevice(
- tensor_in_sizes,
- filter_in_sizes,
- stride,
- padding,
- data_format,
- use_gpu=use_gpu)
- results.append(result)
- tolerance = 1e-2 if use_gpu else 1e-5
+ for dtype in self._DtypesToTest(use_gpu):
+ result = self._SetupValuesForDevice(
+ tensor_in_sizes,
+ filter_in_sizes,
+ stride,
+ padding,
+ data_format,
+ dtype,
+ use_gpu=use_gpu)
+ results.append(result)
+
with self.test_session() as sess:
values = sess.run(results)
for value in values:
print("expected = ", expected)
print("actual = ", value)
- self.assertAllClose(expected, value.flatten(), atol=tolerance,
- rtol=1e-6)
+ tol = 1e-6
+ if value.dtype == np.float16:
+ tol = 1e-3
+
+ self.assertAllClose(expected, value.flatten(), atol=tol,
+ rtol=tol)
def testConv3D1x1x1Filter(self):
expected_output = [
- 30.0, 36.0, 42.0, 66.0, 81.0, 96.0, 102.0, 126.0, 150.0, 138.0, 171.0,
- 204.0, 174.0, 216.0, 258.0, 210.0, 261.0, 312.0
+ 0.18518519, 0.22222222, 0.25925926, 0.40740741, 0.5 ,
+ 0.59259259, 0.62962963, 0.77777778, 0.92592593, 0.85185185,
+ 1.05555556, 1.25925926, 1.07407407, 1.33333333, 1.59259259,
+ 1.2962963 , 1.61111111, 1.92592593
]
# These are equivalent to the Conv2D1x1 case.
@@ -127,8 +149,10 @@ class Conv3DTest(test.TestCase):
# Expected values computed using scipy's correlate function.
def testConv3D2x2x2Filter(self):
expected_output = [
- 19554., 19962., 20370., 22110., 22590., 23070., 34890., 35730., 36570.,
- 37446., 38358., 39270., 50226., 51498., 52770., 52782., 54126., 55470.
+ 3.77199074, 3.85069444, 3.92939815, 4.2650463 , 4.35763889,
+ 4.45023148, 6.73032407, 6.89236111, 7.05439815, 7.22337963,
+ 7.39930556, 7.57523148, 9.68865741, 9.93402778, 10.17939815,
+ 10.18171296, 10.44097222, 10.70023148
]
# expected_shape = [1, 3, 1, 2, 5]
self._VerifyValues(
@@ -140,69 +164,19 @@ class Conv3DTest(test.TestCase):
def testConv3DStrides(self):
expected_output = [
- 102.,
- 151.,
- 172.,
- 193.,
- 214.,
- 235.,
- 142.,
- 438.,
- 592.,
- 613.,
- 634.,
- 655.,
- 676.,
- 394.,
- 774.,
- 1033.,
- 1054.,
- 1075.,
- 1096.,
- 1117.,
- 646.,
- 1894.,
- 2503.,
- 2524.,
- 2545.,
- 2566.,
- 2587.,
- 1486.,
- 2230.,
- 2944.,
- 2965.,
- 2986.,
- 3007.,
- 3028.,
- 1738.,
- 2566.,
- 3385.,
- 3406.,
- 3427.,
- 3448.,
- 3469.,
- 1990.,
- 3686.,
- 4855.,
- 4876.,
- 4897.,
- 4918.,
- 4939.,
- 2830.,
- 4022.,
- 5296.,
- 5317.,
- 5338.,
- 5359.,
- 5380.,
- 3082.,
- 4358.,
- 5737.,
- 5758.,
- 5779.,
- 5800.,
- 5821.,
- 3334.,
+ 0.06071429, 0.08988095, 0.10238095, 0.11488095, 0.12738095,
+ 0.13988095, 0.08452381, 0.26071429, 0.35238095, 0.36488095,
+ 0.37738095, 0.38988095, 0.40238095, 0.23452381, 0.46071429,
+ 0.61488095, 0.62738095, 0.63988095, 0.65238095, 0.66488095,
+ 0.38452381, 1.12738095, 1.48988095, 1.50238095, 1.51488095,
+ 1.52738095, 1.53988095, 0.88452381, 1.32738095, 1.75238095,
+ 1.76488095, 1.77738095, 1.78988095, 1.80238095, 1.03452381,
+ 1.52738095, 2.01488095, 2.02738095, 2.03988095, 2.05238095,
+ 2.06488095, 1.18452381, 2.19404762, 2.88988095, 2.90238095,
+ 2.91488095, 2.92738095, 2.93988095, 1.68452381, 2.39404762,
+ 3.15238095, 3.16488095, 3.17738095, 3.18988095, 3.20238095,
+ 1.83452381, 2.59404762, 3.41488095, 3.42738095, 3.43988095,
+ 3.45238095, 3.46488095, 1.98452381
]
self._VerifyValues(
tensor_in_sizes=[1, 5, 8, 7, 1],
@@ -212,7 +186,10 @@ class Conv3DTest(test.TestCase):
expected=expected_output)
def testConv3D2x2x2FilterStride2(self):
- expected_output = [19554., 19962., 20370., 50226., 51498., 52770.]
+ expected_output = [
+ 3.77199074, 3.85069444, 3.92939815, 9.68865741, 9.93402778,
+ 10.17939815
+ ]
self._VerifyValues(
tensor_in_sizes=[1, 4, 2, 3, 3],
filter_in_sizes=[2, 2, 2, 3, 3],
@@ -222,11 +199,14 @@ class Conv3DTest(test.TestCase):
def testConv3DStride3(self):
expected_output = [
- 36564., 38022., 39480., 37824., 39354., 40884., 39084., 40686., 42288.,
- 46644., 48678., 50712., 47904., 50010., 52116., 49164., 51342., 53520.,
- 107124., 112614., 118104., 108384., 113946., 119508., 109644., 115278.,
- 120912., 117204., 123270., 129336., 118464., 124602., 130740., 119724.,
- 125934., 132144.
+ 1.51140873, 1.57167659, 1.63194444, 1.56349206, 1.62673611,
+ 1.68998016, 1.6155754 , 1.68179563, 1.74801587, 1.9280754 ,
+ 2.01215278, 2.09623016, 1.98015873, 2.0672123 , 2.15426587,
+ 2.03224206, 2.12227183, 2.21230159, 4.4280754 , 4.65500992,
+ 4.88194444, 4.48015873, 4.71006944, 4.93998016, 4.53224206,
+ 4.76512897, 4.99801587, 4.84474206, 5.09548611, 5.34623016,
+ 4.8968254 , 5.15054563, 5.40426587, 4.94890873, 5.20560516,
+ 5.46230159
]
self._VerifyValues(
tensor_in_sizes=[1, 6, 7, 8, 2],
@@ -237,8 +217,9 @@ class Conv3DTest(test.TestCase):
def testConv3D2x2x2FilterStride2Same(self):
expected_output = [
- 19554., 19962., 20370., 10452., 10710., 10968., 50226., 51498., 52770.,
- 23844., 24534., 25224.
+ 3.77199074, 3.85069444, 3.92939815, 2.0162037 , 2.06597222,
+ 2.11574074, 9.68865741, 9.93402778, 10.17939815, 4.59953704,
+ 4.73263889, 4.86574074
]
self._VerifyValues(
tensor_in_sizes=[1, 4, 2, 3, 3],
@@ -248,7 +229,10 @@ class Conv3DTest(test.TestCase):
expected=expected_output)
def testKernelSmallerThanStride(self):
- expected_output = [1., 3., 7., 9., 19., 21., 25., 27.]
+ expected_output = [
+ 0.03703704, 0.11111111, 0.25925926, 0.33333333, 0.7037037 ,
+ 0.77777778, 0.92592593, 1.
+ ]
self._VerifyValues(
tensor_in_sizes=[1, 3, 3, 3, 1],
filter_in_sizes=[1, 1, 1, 1, 1],
@@ -263,9 +247,12 @@ class Conv3DTest(test.TestCase):
expected=expected_output)
expected_output = [
- 1484., 1592., 770., 2240., 2348., 1106., 1149., 1191., 539., 6776.,
- 6884., 3122., 7532., 7640., 3458., 3207., 3249., 1421., 3005., 3035.,
- 1225., 3215., 3245., 1309., 1013., 1022., 343.
+ 0.54081633, 0.58017493, 0.28061224, 0.81632653, 0.85568513,
+ 0.40306122, 0.41873178, 0.4340379 , 0.19642857, 2.46938776,
+ 2.50874636, 1.1377551 , 2.74489796, 2.78425656, 1.26020408,
+ 1.16873178, 1.1840379 , 0.51785714, 1.09511662, 1.10604956,
+ 0.44642857, 1.17164723, 1.18258017, 0.47704082, 0.3691691 ,
+ 0.37244898, 0.125
]
self._VerifyValues(
tensor_in_sizes=[1, 7, 7, 7, 1],
@@ -274,7 +261,10 @@ class Conv3DTest(test.TestCase):
padding="SAME",
expected=expected_output)
- expected_output = [1484., 1592., 2240., 2348., 6776., 6884., 7532., 7640.]
+ expected_output = [
+ 0.540816, 0.580175, 0.816327, 0.855685, 2.469388, 2.508746,
+ 2.744898, 2.784257
+ ]
self._VerifyValues(
tensor_in_sizes=[1, 7, 7, 7, 1],
filter_in_sizes=[2, 2, 2, 1, 1],
@@ -288,7 +278,7 @@ class Conv3DTest(test.TestCase):
filter_in_sizes=[2, 1, 2, 1, 2],
stride=1,
padding="VALID",
- expected=[50, 60])
+ expected=[1.5625, 1.875])
def _ConstructAndTestGradientForConfig(
self, batch, input_shape, filter_shape, in_depth, out_depth, stride,
@@ -328,50 +318,63 @@ class Conv3DTest(test.TestCase):
input_data = [x * 1.0 / input_size for x in range(0, input_size)]
filter_data = [x * 1.0 / filter_size for x in range(0, filter_size)]
- if test.is_gpu_available() and use_gpu:
- data_type = dtypes.float32
+
+ for data_type in self._DtypesToTest(use_gpu=use_gpu):
# TODO(mjanusz): Modify gradient_checker to also provide max relative
# error and synchronize the tolerance levels between the tests for forward
# and backward computations.
- if test.is_gpu_available():
+ if data_type == dtypes.float64:
+ tolerance = 1e-8
+ elif data_type == dtypes.float32:
tolerance = 5e-3
- else:
- # As of Aug 2016, higher tolerance is needed for some CPU architectures.
- # Runs on a single machine can also generate slightly different errors
- # because of multithreading.
- tolerance = 8e-3
- else:
- data_type = dtypes.float64
- tolerance = 1e-8
- with self.test_session(use_gpu=use_gpu):
- orig_input_tensor = constant_op.constant(
+ elif data_type == dtypes.float16:
+ tolerance = 1e-3
+
+
+ with self.test_session(use_gpu=use_gpu):
+ orig_input_tensor = constant_op.constant(
input_data, shape=input_shape, dtype=data_type, name="input")
- filter_tensor = constant_op.constant(
+ filter_tensor = constant_op.constant(
filter_data, shape=filter_shape, dtype=data_type, name="filter")
- if data_format == "NCDHW":
- input_tensor = test_util.NHWCToNCHW(orig_input_tensor)
- strides = test_util.NHWCToNCHW(strides)
- else:
- input_tensor = orig_input_tensor
+ if data_format == "NCDHW":
+ input_tensor = test_util.NHWCToNCHW(orig_input_tensor)
+ new_strides = test_util.NHWCToNCHW(strides)
+ else:
+ input_tensor = orig_input_tensor
+ new_strides = strides
- conv = nn_ops.conv3d(
- input_tensor, filter_tensor, strides, padding,
+ conv = nn_ops.conv3d(
+ input_tensor, filter_tensor, new_strides, padding,
data_format=data_format, name="conv")
- if data_format == "NCDHW":
- conv = test_util.NCHWToNHWC(conv)
+ if data_format == "NCDHW":
+ conv = test_util.NCHWToNHWC(conv)
+
+
+ if test_input:
+ jacob_t, jacob_n = gradient_checker.compute_gradient(orig_input_tensor,
+ input_shape,
+ conv,
+ output_shape)
+ else:
+ jacob_t, jacob_n = gradient_checker.compute_gradient(filter_tensor,
+ filter_shape,
+ conv,
+ output_shape)
+
+
+ if data_type != dtypes.float16:
+ reference_jacob_t = jacob_t
+ err = np.fabs(jacob_t - jacob_n).max()
+ else:
+ # Compare fp16 theoretical gradients to fp32 theoretical gradients,
+ # since fp16 numerical gradients are too imprecise.
+ err = np.fabs(jacob_t - reference_jacob_t).max()
+
+ print("conv3d gradient error = ", err)
+ self.assertLess(err, tolerance)
- if test_input:
- err = gradient_checker.compute_gradient_error(orig_input_tensor,
- input_shape,
- conv, output_shape)
- else:
- err = gradient_checker.compute_gradient_error(filter_tensor,
- filter_shape, conv,
- output_shape)
- print("conv3d gradient error = ", err)
- self.assertLess(err, tolerance)
def ConstructAndTestGradient(self, **kwargs):
for data_format, use_gpu in GetTestConfigs():
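The rewritten gradient check iterates over `_DtypesToTest` and, for float16, compares the fp16 theoretical Jacobian against the fp32 theoretical Jacobian saved on an earlier iteration (which is why float32 must precede float16 in the list): fp16 numerical gradients are too imprecise to serve as a reference. The core of the loop, sketched with a hypothetical `compute_jacobians` helper:

    import numpy as np

    reference_jacob_t = None
    for dtype_name in ["float32", "float16"]:  # order matters
        jacob_t, jacob_n = compute_jacobians(dtype_name)  # hypothetical helper
        if dtype_name != "float16":
            reference_jacob_t = jacob_t
            err = np.fabs(jacob_t - jacob_n).max()  # theoretical vs numerical
        else:
            # Compare fp16 theoretical gradients against the fp32 theoretical
            # gradients instead of the noisy fp16 numerical ones.
            err = np.fabs(jacob_t - reference_jacob_t).max()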
diff --git a/tensorflow/python/kernel_tests/depthwise_conv_op_test.py b/tensorflow/python/kernel_tests/depthwise_conv_op_test.py
index 3298092fbe..f7ae1a0f37 100644
--- a/tensorflow/python/kernel_tests/depthwise_conv_op_test.py
+++ b/tensorflow/python/kernel_tests/depthwise_conv_op_test.py
@@ -122,7 +122,9 @@ class DepthwiseConv2DTest(test.TestCase):
x1 = [f * 1.0 for f in range(1, total_size_1 + 1)]
x2 = [f * 1.0 for f in range(1, total_size_2 + 1)]
with self.test_session(use_gpu=use_gpu) as sess:
- if data_type == dtypes.float32:
+ if data_type == dtypes.float16:
+ tolerance = 1e-5
+ elif data_type == dtypes.float32:
tolerance = 1e-5
else:
self.assertEqual(data_type, dtypes.float64)
@@ -169,7 +171,7 @@ class DepthwiseConv2DTest(test.TestCase):
padding) in enumerate(ConfigsToTest()):
print("Testing DepthwiseConv2D,", index, "th config:", input_size, "*",
filter_size, "stride:", stride, "padding:", padding)
- for data_type in [dtypes.float32, dtypes.float64]:
+ for data_type in [dtypes.float16, dtypes.float32, dtypes.float64]:
self._VerifyValues(
input_size, filter_size, stride, padding, data_type, use_gpu=True)
@@ -181,7 +183,7 @@ class DepthwiseConv2DTest(test.TestCase):
padding) in enumerate(ConfigsToTest()):
print("Testing DepthwiseConv2DFormat,", index, "th config:", input_size,
"*", filter_size, "stride:", stride, "padding:", padding)
- for data_type in [dtypes.float32, dtypes.float64]:
+ for data_type in [dtypes.float16, dtypes.float32, dtypes.float64]:
self._VerifyValues(
input_size,
filter_size,
@@ -318,7 +320,9 @@ class DepthwiseConv2DTest(test.TestCase):
input_data = [x * 1.0 / input_size for x in range(0, input_size)]
filter_data = [x * 1.0 / filter_size for x in range(0, filter_size)]
with self.test_session(use_gpu=use_gpu):
- if data_type == dtypes.float32:
+ if data_type == dtypes.float16:
+ tolerance = 0.002
+ elif data_type == dtypes.float32:
tolerance = 0.002
else:
self.assertEqual(data_type, dtypes.float64)
@@ -369,6 +373,8 @@ class DepthwiseConv2DTest(test.TestCase):
print("Testing DepthwiseConv2DInputGrad,", index, "th config:",
input_size, "*", filter_size, "stride:", stride, "padding:",
padding)
+ # Note: the float16 test for DepthwiseConv2DInputGrad is not enabled
+ # because the calculations are not very precise.
for data_type in [dtypes.float32, dtypes.float64]:
self._ConstructAndTestGradient(
input_size,
@@ -389,6 +395,8 @@ class DepthwiseConv2DTest(test.TestCase):
print("Testing DepthwiseConv2DInputGradFormat,", index, "th config:",
input_size, "*", filter_size, "stride:", stride, "padding:",
padding)
+ # Note: the float16 test for DepthwiseConv2DInputGradFormat is not enabled
+ # because the calculations are not very precise.
for data_type in [dtypes.float32, dtypes.float64]:
self._ConstructAndTestGradient(
input_size,
@@ -407,6 +415,8 @@ class DepthwiseConv2DTest(test.TestCase):
print("Testing DepthwiseConv2DFilterGrad,", index, "th config:",
input_size, "*", filter_size, "stride:", stride, "padding:",
padding)
+ # Note: the float16 test for DepthwiseConv2DFilterGrad is not enabled
+ # because the calculations are not very precise.
for data_type in [dtypes.float32, dtypes.float64]:
self._ConstructAndTestGradient(
input_size,
@@ -427,6 +437,8 @@ class DepthwiseConv2DTest(test.TestCase):
print("Testing DepthwiseConv2DFilterGradFormat,", index, "th config:",
input_size, "*", filter_size, "stride:", stride, "padding:",
padding)
+ # Note: the float16 test for DepthwiseConv2DFilterGradFormat is not enabled
+ # because the calculations are not very precise.
for data_type in [dtypes.float32, dtypes.float64]:
self._ConstructAndTestGradient(
input_size,
diff --git a/tensorflow/python/kernel_tests/distributions/BUILD b/tensorflow/python/kernel_tests/distributions/BUILD
index e21446c2ef..e220d05692 100644
--- a/tensorflow/python/kernel_tests/distributions/BUILD
+++ b/tensorflow/python/kernel_tests/distributions/BUILD
@@ -193,6 +193,7 @@ cuda_py_test(
"//tensorflow/python:math_ops",
"//tensorflow/python:platform_test",
],
+ tags = ["manual"], # b/69001419
)
cuda_py_test(
diff --git a/tensorflow/python/kernel_tests/distributions/multinomial_test.py b/tensorflow/python/kernel_tests/distributions/multinomial_test.py
index ebc89f15c5..d62aca151a 100644
--- a/tensorflow/python/kernel_tests/distributions/multinomial_test.py
+++ b/tensorflow/python/kernel_tests/distributions/multinomial_test.py
@@ -250,13 +250,11 @@ class MultinomialTest(test.TestCase):
theta = np.array([[1., 2, 3],
[2.5, 4, 0.01]], dtype=np.float32)
theta /= np.sum(theta, 1)[..., array_ops.newaxis]
- # Ideally we'd be able to test broadcasting but, the multinomial sampler
- # doesn't support different total counts.
- n = np.float32(5)
+ n = np.array([[10., 9.], [8., 7.], [6., 5.]], dtype=np.float32)
with self.test_session() as sess:
- # batch_shape=[2], event_shape=[3]
+ # batch_shape=[3, 2], event_shape=[3]
dist = multinomial.Multinomial(n, theta)
- x = dist.sample(int(250e3), seed=1)
+ x = dist.sample(int(1000e3), seed=1)
sample_mean = math_ops.reduce_mean(x, 0)
x_centered = x - sample_mean[array_ops.newaxis, ...]
sample_cov = math_ops.reduce_mean(math_ops.matmul(
@@ -283,17 +281,17 @@ class MultinomialTest(test.TestCase):
dist.variance(),
dist.stddev(),
])
- self.assertAllClose(sample_mean_, analytic_mean, atol=0.01, rtol=0.01)
- self.assertAllClose(sample_cov_, analytic_cov, atol=0.01, rtol=0.01)
- self.assertAllClose(sample_var_, analytic_var, atol=0.01, rtol=0.01)
- self.assertAllClose(sample_stddev_, analytic_stddev, atol=0.01, rtol=0.01)
+ self.assertAllClose(sample_mean_, analytic_mean, atol=0., rtol=0.01)
+ self.assertAllClose(sample_cov_, analytic_cov, atol=0., rtol=0.01)
+ self.assertAllClose(sample_var_, analytic_var, atol=0., rtol=0.01)
+ self.assertAllClose(sample_stddev_, analytic_stddev, atol=0., rtol=0.01)
def testSampleUnbiasedNonScalarBatch(self):
with self.test_session() as sess:
dist = multinomial.Multinomial(
- total_count=5.,
+ total_count=[7., 6., 5.],
logits=math_ops.log(2. * self._rng.rand(4, 3, 2).astype(np.float32)))
- n = int(3e3)
+ n = int(3e4)
x = dist.sample(n, seed=0)
sample_mean = math_ops.reduce_mean(x, 0)
# Cyclically rotate event dims left.
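With per-batch total counts now exercised, the test draws a large sample and checks sample moments against the analytic mean n*p and covariance n*(diag(p) - p p^T) using a purely relative tolerance (atol=0). The same check in plain NumPy for a single batch, with illustrative tolerances:

    import numpy as np

    rng = np.random.RandomState(1)
    n, p = 10, np.array([1., 2., 3.]) / 6.
    x = rng.multinomial(n, p, size=1000000).astype(np.float64)

    np.testing.assert_allclose(x.mean(axis=0), n * p, rtol=0.01)
    analytic_cov = n * (np.diag(p) - np.outer(p, p))
    np.testing.assert_allclose(np.cov(x, rowvar=False), analytic_cov,
                               rtol=0.02)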
diff --git a/tensorflow/python/kernel_tests/dynamic_partition_op_test.py b/tensorflow/python/kernel_tests/dynamic_partition_op_test.py
index 4883095707..2460950aa9 100644
--- a/tensorflow/python/kernel_tests/dynamic_partition_op_test.py
+++ b/tensorflow/python/kernel_tests/dynamic_partition_op_test.py
@@ -33,8 +33,8 @@ from tensorflow.python.platform import test
class DynamicPartitionTest(test.TestCase):
def testSimpleOneDimensional(self):
- with self.test_session() as sess:
- data = constant_op.constant([0, 13, 2, 39, 4, 17])
+ with self.test_session(use_gpu=True) as sess:
+ data = constant_op.constant([0, 13, 2, 39, 4, 17], dtype=dtypes.float32)
indices = constant_op.constant([0, 0, 2, 3, 2, 1])
partitions = data_flow_ops.dynamic_partition(
data, indices, num_partitions=4)
@@ -52,9 +52,10 @@ class DynamicPartitionTest(test.TestCase):
self.assertEqual([None], partitions[3].get_shape().as_list())
def testSimpleTwoDimensional(self):
- with self.test_session() as sess:
+ with self.test_session(use_gpu=True) as sess:
data = constant_op.constant([[0, 1, 2], [3, 4, 5], [6, 7, 8], [9, 10, 11],
- [12, 13, 14], [15, 16, 17]])
+ [12, 13, 14], [15, 16, 17]],
+ dtype=dtypes.float32)
indices = constant_op.constant([0, 0, 2, 3, 2, 1])
partitions = data_flow_ops.dynamic_partition(
data, indices, num_partitions=4)
@@ -71,9 +72,61 @@ class DynamicPartitionTest(test.TestCase):
self.assertEqual([None, 3], partitions[2].get_shape().as_list())
self.assertEqual([None, 3], partitions[3].get_shape().as_list())
+ def testLargeOneDimensional(self):
+ num = 100000
+ data_list = [x for x in range(num)]
+ indices_list = [x % 2 for x in range(num)]
+ part1 = [x for x in range(num) if x % 2 == 0]
+ part2 = [x for x in range(num) if x % 2 == 1]
+ with self.test_session(use_gpu=True) as sess:
+ data = constant_op.constant(data_list, dtype=dtypes.float32)
+ indices = constant_op.constant(indices_list, dtype=dtypes.int32)
+ partitions = data_flow_ops.dynamic_partition(
+ data, indices, num_partitions=2)
+ partition_vals = sess.run(partitions)
+
+ self.assertAllEqual(part1, partition_vals[0])
+ self.assertAllEqual(part2, partition_vals[1])
+
+ def testLargeTwoDimensional(self):
+ rows = 100000
+ cols = 100
+ data_list = [None] * rows
+ for i in range(rows):
+ data_list[i] = [i for _ in range(cols)]
+ num_partitions = 97
+ indices_list = [(i ** 2) % num_partitions for i in range(rows)]
+ parts = [[] for _ in range(num_partitions)]
+ for i in range(rows):
+ parts[(i ** 2) % num_partitions].append(data_list[i])
+ with self.test_session(use_gpu=True) as sess:
+ data = constant_op.constant(data_list, dtype=dtypes.float32)
+ indices = constant_op.constant(indices_list, dtype=dtypes.int32)
+ partitions = data_flow_ops.dynamic_partition(
+ data, indices, num_partitions=num_partitions)
+ partition_vals = sess.run(partitions)
+
+ for i in range(num_partitions):
+ # reshape so that empty parts still compare as (0, cols) arrays
+ parts_np = np.array(parts[i], dtype=np.float).reshape(-1, cols)
+ self.assertAllEqual(parts_np, partition_vals[i])
+
+ def testSimpleComplex(self):
+ data_list = [1 + 2j, 3 + 4j, 5 + 6j, 7 + 8j]
+ indices_list = [1, 0, 1, 0]
+ with self.test_session(use_gpu=True) as sess:
+ data = constant_op.constant(data_list, dtype=dtypes.complex64)
+ indices = constant_op.constant(indices_list, dtype=dtypes.int32)
+ partitions = data_flow_ops.dynamic_partition(
+ data, indices, num_partitions=2)
+ partition_vals = sess.run(partitions)
+
+ self.assertAllEqual([3 + 4j, 7 + 8j], partition_vals[0])
+ self.assertAllEqual([1 + 2j, 5 + 6j], partition_vals[1])
+
def testHigherRank(self):
np.random.seed(7)
- with self.test_session() as sess:
+ with self.test_session(use_gpu=True) as sess:
for n in 2, 3:
for shape in (4,), (4, 5), (4, 5, 2):
partitions = np.random.randint(n, size=np.prod(shape)).reshape(shape)
@@ -95,6 +148,49 @@ class DynamicPartitionTest(test.TestCase):
self.assertEqual(grads[1], None) # Partitions has no gradients
self.assertAllEqual(7 * data, sess.run(grads[0]))
+ def testEmptyParts(self):
+ data_list = [1, 2, 3, 4]
+ indices_list = [1, 3, 1, 3]
+ with self.test_session(use_gpu=True) as sess:
+ data = constant_op.constant(data_list, dtype=dtypes.float32)
+ indices = constant_op.constant(indices_list, dtype=dtypes.int32)
+ partitions = data_flow_ops.dynamic_partition(
+ data, indices, num_partitions=4)
+ partition_vals = sess.run(partitions)
+
+ self.assertAllEqual([], partition_vals[0])
+ self.assertAllEqual([1, 3], partition_vals[1])
+ self.assertAllEqual([], partition_vals[2])
+ self.assertAllEqual([2, 4], partition_vals[3])
+
+ def testEmptyDataTwoDimensional(self):
+ data_list = [[], []]
+ indices_list = [0, 1]
+ with self.test_session(use_gpu=True) as sess:
+ data = constant_op.constant(data_list, dtype=dtypes.float32)
+ indices = constant_op.constant(indices_list, dtype=dtypes.int32)
+ partitions = data_flow_ops.dynamic_partition(
+ data, indices, num_partitions=3)
+ partition_vals = sess.run(partitions)
+
+ self.assertAllEqual([[]], partition_vals[0])
+ self.assertAllEqual([[]], partition_vals[1])
+ self.assertAllEqual(np.array([], dtype=np.float).reshape(0, 0),
+ partition_vals[2])
+
+ def testEmptyPartitions(self):
+ data_list = []
+ indices_list = []
+ with self.test_session(use_gpu=True) as sess:
+ data = constant_op.constant(data_list, dtype=dtypes.float32)
+ indices = constant_op.constant(indices_list, dtype=dtypes.int32)
+ partitions = data_flow_ops.dynamic_partition(
+ data, indices, num_partitions=2)
+ partition_vals = sess.run(partitions)
+
+ self.assertAllEqual([], partition_vals[0])
+ self.assertAllEqual([], partition_vals[1])
+
def testErrorIndexOutOfRange(self):
with self.test_session() as sess:
data = constant_op.constant([[0, 1, 2], [3, 4, 5], [6, 7, 8], [9, 10, 11],
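All of the new dynamic_partition cases (large, complex, empty) reduce to the same reference behavior: rows of `data` are scattered into `num_partitions` lists keyed by `indices`, keeping their original relative order within each partition. A NumPy sketch:

    import numpy as np

    def dynamic_partition_reference(data, indices, num_partitions):
        data = np.asarray(data)
        parts = [[] for _ in range(num_partitions)]
        for row, idx in zip(data, indices):
            parts[idx].append(row)  # order within a partition is preserved
        return [np.array(p).reshape((-1,) + data.shape[1:]) for p in parts]

    parts = dynamic_partition_reference([0., 13., 2., 39., 4., 17.],
                                        [0, 0, 2, 3, 2, 1], num_partitions=4)
    assert list(parts[0]) == [0., 13.] and list(parts[2]) == [2., 4.]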
diff --git a/tensorflow/python/kernel_tests/gather_nd_op_test.py b/tensorflow/python/kernel_tests/gather_nd_op_test.py
index 5109ed98c9..af5e23c926 100644
--- a/tensorflow/python/kernel_tests/gather_nd_op_test.py
+++ b/tensorflow/python/kernel_tests/gather_nd_op_test.py
@@ -25,7 +25,6 @@ import numpy as np
from tensorflow.python.client import session
from tensorflow.python.framework import constant_op
from tensorflow.python.framework import dtypes
-from tensorflow.python.framework import ops
from tensorflow.python.ops import array_ops
from tensorflow.python.ops import gradients_impl
from tensorflow.python.ops import variables
@@ -186,9 +185,6 @@ class GatherNdTest(test.TestCase):
self.assertAllEqual(expected.reshape([10, 10, 20]), gather_nd_val)
self.assertEqual([10, 10, 20], gather_nd_t.get_shape())
- def assertIndexedSlices(self, t):
- self.assertIsInstance(t, ops.IndexedSlices)
-
def testUnknownIndices(self):
params = constant_op.constant([[0, 1, 2]])
indices = array_ops.placeholder(dtypes.int32)
@@ -237,8 +233,7 @@ class GatherNdTest(test.TestCase):
grads = gradients_impl.gradients([outputs], [inputs], [grad_vals])[0]
expected_grads = np.array([[3, 4], [1, 2]], dtype=np.float64)
with self.test_session(use_gpu=True):
- self.assertIndexedSlices(grads)
- self.assertAllEqual(expected_grads, ops.convert_to_tensor(grads).eval())
+ self.assertAllEqual(expected_grads, grads.eval())
def testGradientsRank3Elements(self):
indices = constant_op.constant(
@@ -289,8 +284,7 @@ class GatherNdTest(test.TestCase):
[0, 0, 0, 0, 0, 0, 0, 0, 0], [3, 3, 3, 3, 3, 3, 3, 3, 3]],
dtype=np.float64)
with self.test_session(use_gpu=True):
- self.assertIndexedSlices(grads)
- self.assertAllEqual(expected_grads, ops.convert_to_tensor(grads).eval())
+ self.assertAllEqual(expected_grads, grads.eval())
class GatherNdOpBenchmark(test.Benchmark):
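Dropping `assertIndexedSlices` means the gather_nd gradient is expected to arrive here as a dense tensor rather than as `ops.IndexedSlices`; semantically it is still a scatter-add of the incoming gradient back into the params shape. A sketch of that semantics:

    import numpy as np

    def gather_nd_grad_reference(params_shape, indices, grad_vals):
        # Scatter the incoming gradient back to the gathered slots,
        # accumulating on duplicate indices.
        grad = np.zeros(params_shape, dtype=np.float64)
        for idx, g in zip(indices, grad_vals):
            grad[tuple(idx)] += g
        return grad

    g = gather_nd_grad_reference((2, 2), [[1, 0], [0, 1]], [1., 2.])
    # g == [[0., 2.], [1., 0.]]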
diff --git a/tensorflow/python/kernel_tests/iterator_ops_test.py b/tensorflow/python/kernel_tests/iterator_ops_test.py
index 2128ef4ae1..60a44b5b14 100644
--- a/tensorflow/python/kernel_tests/iterator_ops_test.py
+++ b/tensorflow/python/kernel_tests/iterator_ops_test.py
@@ -17,14 +17,12 @@ from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
-import os
import numpy as np
from tensorflow.core.protobuf import config_pb2
from tensorflow.python.client import session
from tensorflow.python.data.ops import dataset_ops
from tensorflow.python.data.ops import iterator_ops
-from tensorflow.python.data.ops import readers
from tensorflow.python.framework import constant_op
from tensorflow.python.framework import dtypes
from tensorflow.python.framework import errors
@@ -33,9 +31,7 @@ from tensorflow.python.framework import ops
from tensorflow.python.framework import test_util
from tensorflow.python.ops import array_ops
from tensorflow.python.ops import functional_ops
-from tensorflow.python.ops import gen_dataset_ops
from tensorflow.python.ops import gradients_impl
-from tensorflow.python.ops import io_ops
from tensorflow.python.ops import math_ops
from tensorflow.python.ops import parsing_ops
from tensorflow.python.ops import script_ops
@@ -537,64 +533,6 @@ class IteratorTest(test.TestCase):
target_placeholder: "/job:localhost/replica:0/task:0/cpu:0"
})
- def testIncorrectIteratorRestore(self):
-
- def _path():
- return os.path.join(self.get_temp_dir(), "iterator")
-
- def _save_op(iterator_resource):
- iterator_state_variant = gen_dataset_ops.serialize_iterator(
- iterator_resource)
- save_op = io_ops.write_file(
- _path(), parsing_ops.serialize_tensor(iterator_state_variant))
- return save_op
-
- def _restore_op(iterator_resource):
- iterator_state_variant = parsing_ops.parse_tensor(
- io_ops.read_file(_path()), dtypes.variant)
- restore_op = gen_dataset_ops.deserialize_iterator(iterator_resource,
- iterator_state_variant)
- return restore_op
-
- def _build_range_dataset_graph():
- start = 1
- stop = 10
- iterator = dataset_ops.Dataset.range(start,
- stop).make_initializable_iterator()
- init_op = iterator.initializer
- get_next = iterator.get_next()
- save_op = _save_op(iterator._iterator_resource)
- restore_op = _restore_op(iterator._iterator_resource)
- return init_op, get_next, save_op, restore_op
-
- def _build_reader_dataset_graph():
- filenames = ["test"] # Does not exist but we don't care in this test.
- iterator = readers.FixedLengthRecordDataset(
- filenames, 1, 0, 0).make_initializable_iterator()
- init_op = iterator.initializer
- get_next_op = iterator.get_next()
- save_op = _save_op(iterator._iterator_resource)
- restore_op = _restore_op(iterator._iterator_resource)
- return init_op, get_next_op, save_op, restore_op
-
- # Saving iterator for RangeDataset graph.
- with ops.Graph().as_default() as g:
- init_op, _, save_op, _ = _build_range_dataset_graph()
- with self.test_session(graph=g) as sess:
- sess.run(init_op)
- sess.run(save_op)
-
- # Attempt to restore the saved iterator into an IteratorResource of
- # incompatible type. An iterator of RangeDataset has output type int64,
- # while an iterator of FixedLengthRecordDataset has output type string.
- # So an InvalidArgumentError should be raised by
- # IteratorResource::set_iterator.
- with ops.Graph().as_default() as g:
- _, _, _, restore_op = _build_reader_dataset_graph()
- with self.test_session(graph=g) as sess:
- with self.assertRaises(errors.InvalidArgumentError):
- sess.run(restore_op)
-
if __name__ == "__main__":
test.main()
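
The deleted test drove iterator checkpointing through the raw ops directly. A minimal sketch of that plumbing, assuming TF 1.x graph mode (the helper name and the file path here are illustrative, not from the test):

    from tensorflow.python.framework import dtypes
    from tensorflow.python.ops import gen_dataset_ops
    from tensorflow.python.ops import io_ops
    from tensorflow.python.ops import parsing_ops

    def make_save_restore_ops(iterator_resource, path):
      # Serialize the iterator state into a variant tensor and write it out.
      state = gen_dataset_ops.serialize_iterator(iterator_resource)
      save_op = io_ops.write_file(path, parsing_ops.serialize_tensor(state))
      # Read the file back, parse the variant, and load it into the resource.
      restored = parsing_ops.parse_tensor(io_ops.read_file(path),
                                          dtypes.variant)
      restore_op = gen_dataset_ops.deserialize_iterator(iterator_resource,
                                                        restored)
      return save_op, restore_op
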
diff --git a/tensorflow/python/kernel_tests/pooling_ops_test.py b/tensorflow/python/kernel_tests/pooling_ops_test.py
index c699d50c02..988a72603f 100644
--- a/tensorflow/python/kernel_tests/pooling_ops_test.py
+++ b/tensorflow/python/kernel_tests/pooling_ops_test.py
@@ -19,6 +19,7 @@ from __future__ import division
from __future__ import print_function
import numpy as np
+import os
from tensorflow.python.framework import constant_op
from tensorflow.python.framework import dtypes
@@ -1341,11 +1342,33 @@ class PoolingTest(test.TestCase):
return
# Test the GPU implementation that uses cudnn for now.
- # It does not propagate the diff in cases of NaNs
+ saved_nanprop = os.environ.get("TF_ENABLE_MAXPOOL_NANPROP")
+ # Do not propagate the diff in cases of NaNs
+ os.environ["TF_ENABLE_MAXPOOL_NANPROP"] = "0"
expected_input_backprop_cudnn = [
0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,
0.0, 0.0
]
+
+ for v2 in [True, False]:
+ self._testMaxPoolGradDirect(
+ input_data,
+ output_backprop,
+ expected_input_backprop_cudnn,
+ input_sizes=[1, 4, 4, 1],
+ output_sizes=[1, 3, 3, 1],
+ window_rows=2,
+ window_cols=2,
+ row_stride=1,
+ col_stride=1,
+ padding="VALID",
+ use_gpu=True,
+ v2=v2)
+
+ # Propagate the diff in cases of NaNs
+ os.environ["TF_ENABLE_MAXPOOL_NANPROP"] = "1"
+ expected_input_backprop_cudnn = expected_input_backprop_tf_cpu
+
for v2 in [True, False]:
self._testMaxPoolGradDirect(
input_data,
@@ -1361,6 +1384,11 @@ class PoolingTest(test.TestCase):
use_gpu=True,
v2=v2)
+ if saved_nanprop is not None:
+ os.environ["TF_ENABLE_MAXPOOL_NANPROP"] = saved_nanprop
+ else:
+ del os.environ["TF_ENABLE_MAXPOOL_NANPROP"]
+
def _testMaxPoolGradDirectWithNans2_2(self):
input_data = [float("nan")] * 16
output_backprop = [
@@ -1391,11 +1419,14 @@ class PoolingTest(test.TestCase):
return
# Test the GPU implementation that uses cudnn for now.
- # It does not propagate the diff in cases of NaNs
+ saved_nanprop = os.environ.get("TF_ENABLE_MAXPOOL_NANPROP")
+ # Do not propagate the diff in cases of NaNs
+ os.environ["TF_ENABLE_MAXPOOL_NANPROP"] = "0"
expected_input_backprop_cudnn = [
0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,
0.0, 0.0
]
+
for v2 in [True, False]:
self._testMaxPoolGradDirect(
input_data,
@@ -1411,6 +1442,31 @@ class PoolingTest(test.TestCase):
use_gpu=True,
v2=v2)
+
+ # Propagate the diff in cases of NaNs
+ os.environ["TF_ENABLE_MAXPOOL_NANPROP"] = "1"
+ expected_input_backprop_cudnn = expected_input_backprop_tf_cpu
+
+ for v2 in [True, False]:
+ self._testMaxPoolGradDirect(
+ input_data,
+ output_backprop,
+ expected_input_backprop_cudnn,
+ input_sizes=[1, 4, 4, 1],
+ output_sizes=[1, 3, 3, 1],
+ window_rows=2,
+ window_cols=2,
+ row_stride=1,
+ col_stride=1,
+ padding="VALID",
+ use_gpu=True,
+ v2=v2)
+
+ if saved_nanprop is not None:
+ os.environ["TF_ENABLE_MAXPOOL_NANPROP"] = saved_nanprop
+ else:
+ del os.environ["TF_ENABLE_MAXPOOL_NANPROP"]
+
def testMaxPoolGradDirect(self):
self._testMaxPoolGradDirect1_1()
self._testMaxPoolGradDirect1_2()
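
The two hunks above repeat the same save/override/restore dance for TF_ENABLE_MAXPOOL_NANPROP by hand. The pattern factors naturally into a context manager; a standard-library-only sketch (the helper name is made up):

    import contextlib
    import os

    @contextlib.contextmanager
    def scoped_env(name, value):
      """Temporarily set an environment variable, restoring it on exit."""
      saved = os.environ.get(name)
      os.environ[name] = value
      try:
        yield
      finally:
        if saved is None:
          del os.environ[name]
        else:
          os.environ[name] = saved

    # Usage mirroring the tests:
    # with scoped_env("TF_ENABLE_MAXPOOL_NANPROP", "0"):
    #   ...run the cudnn max-pool grad checks without NaN propagation...

The `finally` block also restores the variable when an assertion fails mid-test, which the inline save/restore above does not.
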
diff --git a/tensorflow/python/kernel_tests/range_dataset_op_test.py b/tensorflow/python/kernel_tests/range_dataset_op_test.py
index 0c530522b8..3c1685c951 100644
--- a/tensorflow/python/kernel_tests/range_dataset_op_test.py
+++ b/tensorflow/python/kernel_tests/range_dataset_op_test.py
@@ -17,32 +17,15 @@ from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
-import os
-
from tensorflow.python.data.ops import dataset_ops
-from tensorflow.python.data.ops import iterator_ops
from tensorflow.python.framework import dtypes
from tensorflow.python.framework import errors
-from tensorflow.python.framework import ops
-from tensorflow.python.framework import tensor_shape
from tensorflow.python.ops import array_ops
-from tensorflow.python.ops import gen_dataset_ops
-from tensorflow.python.ops import io_ops
-from tensorflow.python.ops import parsing_ops
-from tensorflow.python.ops import variables
-from tensorflow.python.platform import gfile
from tensorflow.python.platform import test
class RangeDatasetTest(test.TestCase):
- def tearDown(self):
- # Remove all checkpoint files.
- prefix = self._iterator_checkpoint_prefix()
- pattern = prefix + "*"
- files = gfile.Glob(pattern)
- map(gfile.Remove, files)
-
def testStop(self):
stop = array_ops.placeholder(dtypes.int64, shape=[])
iterator = dataset_ops.Dataset.range(stop).make_initializable_iterator()
@@ -168,319 +151,6 @@ class RangeDatasetTest(test.TestCase):
with self.assertRaises(errors.OutOfRangeError):
sess.run(get_next)
- def _iterator_checkpoint_prefix(self):
- return os.path.join(self.get_temp_dir(), "iterator")
-
- def _save_op(self, iterator_resource):
- iterator_state_variant = gen_dataset_ops.serialize_iterator(
- iterator_resource)
- save_op = io_ops.write_file(
- self._iterator_checkpoint_prefix(),
- parsing_ops.serialize_tensor(iterator_state_variant))
- return save_op
-
- def _restore_op(self, iterator_resource):
- iterator_state_variant = parsing_ops.parse_tensor(
- io_ops.read_file(self._iterator_checkpoint_prefix()), dtypes.variant)
- restore_op = gen_dataset_ops.deserialize_iterator(iterator_resource,
- iterator_state_variant)
- return restore_op
-
- def testSaveRestore(self):
-
- def _build_graph(start, stop):
- iterator = dataset_ops.Dataset.range(start,
- stop).make_initializable_iterator()
- init_op = iterator.initializer
- get_next = iterator.get_next()
- save_op = self._save_op(iterator._iterator_resource)
- restore_op = self._restore_op(iterator._iterator_resource)
- return init_op, get_next, save_op, restore_op
-
- # Saving and restoring in different sessions.
- start = 2
- stop = 10
- break_point = 5
- with ops.Graph().as_default() as g:
- init_op, get_next, save_op, _ = _build_graph(start, stop)
- with self.test_session(graph=g) as sess:
- sess.run(variables.global_variables_initializer())
- sess.run(init_op)
- for i in range(start, break_point):
- self.assertEqual(i, sess.run(get_next))
- sess.run(save_op)
-
- with ops.Graph().as_default() as g:
- init_op, get_next, _, restore_op = _build_graph(start, stop)
- with self.test_session(graph=g) as sess:
- sess.run(init_op)
- sess.run(restore_op)
- for i in range(break_point, stop):
- self.assertEqual(i, sess.run(get_next))
- with self.assertRaises(errors.OutOfRangeError):
- sess.run(get_next)
-
- # Saving and restoring in same session.
- with ops.Graph().as_default() as g:
- init_op, get_next, save_op, restore_op = _build_graph(start, stop)
- with self.test_session(graph=g) as sess:
- sess.run(variables.global_variables_initializer())
- sess.run(init_op)
- for i in range(start, break_point):
- self.assertEqual(i, sess.run(get_next))
- sess.run(save_op)
- sess.run(restore_op)
- for i in range(break_point, stop):
- self.assertEqual(i, sess.run(get_next))
- with self.assertRaises(errors.OutOfRangeError):
- sess.run(get_next)
-
- def testRestoreWithoutBuildingDatasetGraph(self):
-
- def _build_graph(start, stop, num_epochs):
- dataset = dataset_ops.Dataset.range(start, stop).repeat(num_epochs)
- iterator = dataset.make_initializable_iterator()
- init_op = iterator.initializer
- get_next = iterator.get_next()
- save_op = self._save_op(iterator._iterator_resource)
- restore_op = self._restore_op(iterator._iterator_resource)
- return init_op, get_next, save_op, restore_op
-
- # Saving and restoring in different sessions.
- start = 2
- stop = 10
- num_epochs = 5
- break_point = 5
- break_epoch = 3
- with ops.Graph().as_default() as g:
- init_op, get_next, save_op, _ = _build_graph(start, stop, num_epochs)
- with self.test_session(graph=g) as sess:
- sess.run(variables.global_variables_initializer())
- sess.run(init_op)
- for _ in range(break_epoch):
- for i in range(start, stop):
- self.assertEqual(i, sess.run(get_next))
- for i in range(start, break_point):
- self.assertEqual(i, sess.run(get_next))
- sess.run(save_op)
-
- with ops.Graph().as_default() as g:
- # Create an empty IteratorResource and restore the Iterator into it.
- output_types = dtypes.int64
- output_shapes = tensor_shape.scalar()
- iterator = iterator_ops.Iterator.from_structure(output_types,
- output_shapes)
- restore_op = self._restore_op(iterator._iterator_resource)
- get_next = iterator.get_next()
- with self.test_session(graph=g) as sess:
- sess.run(restore_op)
- for i in range(break_point, stop):
- self.assertEqual(i, sess.run(get_next))
- for _ in range(break_epoch + 1, num_epochs):
- for i in range(start, stop):
- self.assertEqual(i, sess.run(get_next))
- with self.assertRaises(errors.OutOfRangeError):
- sess.run(get_next)
-
- def testRestoreInModifiedGraph(self):
-
- def _build_graph(start, stop):
- dataset = dataset_ops.Dataset.range(start, stop)
- iterator = dataset.make_initializable_iterator()
- init_op = iterator.initializer
- get_next = iterator.get_next()
- save_op = self._save_op(iterator._iterator_resource)
- restore_op = self._restore_op(iterator._iterator_resource)
- return init_op, get_next, save_op, restore_op
-
- # Saving and restoring in different sessions.
- start = 2
- stop = 10
- stop_1 = 8
- break_point = 5
- with ops.Graph().as_default() as g:
- init_op, get_next, save_op, _ = _build_graph(start, stop)
- with self.test_session(graph=g) as sess:
- sess.run(variables.global_variables_initializer())
- sess.run(init_op)
- for i in range(start, break_point):
- self.assertEqual(i, sess.run(get_next))
- sess.run(save_op)
-
- with ops.Graph().as_default() as g:
- # Intentionally build a graph with a different value for stop to make sure
- # the original dataset graph is actually getting loaded.
- init_op, get_next, _, restore_op = _build_graph(start, stop_1)
- with self.test_session(graph=g) as sess:
- sess.run(restore_op)
- for i in range(break_point, stop):
- self.assertEqual(i, sess.run(get_next))
- with self.assertRaises(errors.OutOfRangeError):
- sess.run(get_next)
-
- def testInitThenRestore(self):
- # Note: Calling init_op before restore_op is redundant. This test just makes
- # sure we do not fail if restore is called on an already initialized
- # iterator resource.
-
- def _build_graph(start, stop):
- dataset = dataset_ops.Dataset.range(start, stop)
- iterator = dataset.make_initializable_iterator()
- init_op = iterator.initializer
- get_next = iterator.get_next()
- save_op = self._save_op(iterator._iterator_resource)
- restore_op = self._restore_op(iterator._iterator_resource)
- return init_op, get_next, save_op, restore_op
-
- # Saving and restoring in different sessions.
- start = 2
- stop = 10
- break_point = 5
- with ops.Graph().as_default() as g:
- init_op, get_next, save_op, _ = _build_graph(start, stop)
- with self.test_session(graph=g) as sess:
- sess.run(variables.global_variables_initializer())
- sess.run(init_op)
- for i in range(start, break_point):
- self.assertEqual(i, sess.run(get_next))
- sess.run(save_op)
-
- with ops.Graph().as_default() as g:
- init_op, get_next, _, restore_op = _build_graph(start, stop)
- with self.test_session(graph=g) as sess:
- sess.run(init_op)
- sess.run(restore_op)
- for i in range(break_point, stop):
- self.assertEqual(i, sess.run(get_next))
- with self.assertRaises(errors.OutOfRangeError):
- sess.run(get_next)
-
- def testMultipleSaves(self):
-
- def _build_graph(start, stop):
- iterator = dataset_ops.Dataset.range(start,
- stop).make_initializable_iterator()
- init_op = iterator.initializer
- get_next = iterator.get_next()
- save_op = self._save_op(iterator._iterator_resource)
- restore_op = self._restore_op(iterator._iterator_resource)
- return init_op, get_next, save_op, restore_op
-
- start = 2
- stop = 10
- break_point1 = 5
- break_point2 = 7
-
- with ops.Graph().as_default() as g:
- init_op, get_next, save_op, _ = _build_graph(start, stop)
- with self.test_session(graph=g) as sess:
- sess.run(variables.global_variables_initializer())
- sess.run(init_op)
- for i in range(start, break_point1):
- self.assertEqual(i, sess.run(get_next))
- sess.run(save_op)
-
- with ops.Graph().as_default() as g:
- init_op, get_next, save_op, restore_op = _build_graph(start, stop)
- with self.test_session(graph=g) as sess:
- sess.run(restore_op)
- for i in range(break_point1, break_point2):
- self.assertEqual(i, sess.run(get_next))
- sess.run(save_op)
-
- break_point2 = 7
- with ops.Graph().as_default() as g:
- init_op, get_next, save_op, restore_op = _build_graph(start, stop)
- with self.test_session(graph=g) as sess:
- sess.run(restore_op)
- for i in range(break_point2, stop):
- self.assertEqual(i, sess.run(get_next))
- with self.assertRaises(errors.OutOfRangeError):
- sess.run(get_next)
-
- def testSaveRestoreWithRepeat(self):
-
- def _build_graph(start, stop, num_epochs):
- iterator = dataset_ops.Dataset.range(
- start, stop).repeat(num_epochs).make_initializable_iterator()
- init_op = iterator.initializer
- get_next = iterator.get_next()
- save_op = self._save_op(iterator._iterator_resource)
- restore_op = self._restore_op(iterator._iterator_resource)
- return init_op, get_next, save_op, restore_op
-
- start = 2
- stop = 10
- num_epochs = 5
- break_range = 5
- break_epoch = 3
- with ops.Graph().as_default() as g:
- init_op, get_next, save_op, restore_op = _build_graph(
- start, stop, num_epochs)
- with self.test_session(graph=g) as sess:
- sess.run(variables.global_variables_initializer())
- sess.run(init_op)
- # Note: There is no checkpoint saved currently so a NotFoundError is
- # raised.
- with self.assertRaises(errors.NotFoundError):
- sess.run(restore_op)
- for _ in range(break_epoch - 1):
- for i in range(start, stop):
- self.assertEqual(i, sess.run(get_next))
- for i in range(start, break_range):
- self.assertEqual(i, sess.run(get_next))
- sess.run(save_op)
-
- with ops.Graph().as_default() as g:
- init_op, get_next, _, restore_op = _build_graph(start, stop, num_epochs)
- with self.test_session(graph=g) as sess:
- sess.run(restore_op)
- for i in range(break_range, stop):
- self.assertEqual(i, sess.run(get_next))
- for _ in range(break_epoch, num_epochs):
- for i in range(start, stop):
- self.assertEqual(i, sess.run(get_next))
- with self.assertRaises(errors.OutOfRangeError):
- sess.run(get_next)
-
- def testSaveRestoreExhaustedIterator(self):
-
- def _build_graph(start, stop, num_epochs):
- iterator = dataset_ops.Dataset.range(
- start, stop).repeat(num_epochs).make_initializable_iterator()
- init_op = iterator.initializer
- get_next = iterator.get_next()
- save_op = self._save_op(iterator._iterator_resource)
- restore_op = self._restore_op(iterator._iterator_resource)
- return init_op, get_next, save_op, restore_op
-
- start = 2
- stop = 10
- num_epochs = 5
- with ops.Graph().as_default() as g:
- init_op, get_next, save_op, restore_op = _build_graph(
- start, stop, num_epochs)
- with self.test_session(graph=g) as sess:
- sess.run(variables.global_variables_initializer())
- sess.run(init_op)
- # Note: There is no checkpoint saved currently so a NotFoundError is
- # raised.
- with self.assertRaises(errors.NotFoundError):
- sess.run(restore_op)
- for _ in range(num_epochs):
- for i in range(start, stop):
- self.assertEqual(i, sess.run(get_next))
- with self.assertRaises(errors.OutOfRangeError):
- sess.run(get_next)
- sess.run(save_op)
-
- with ops.Graph().as_default() as g:
- init_op, get_next, _, restore_op = _build_graph(start, stop, num_epochs)
- with self.test_session(graph=g) as sess:
- sess.run(restore_op)
- with self.assertRaises(errors.OutOfRangeError):
- sess.run(get_next)
-
if __name__ == "__main__":
test.main()
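
Every deleted test in this file follows one round trip: run the iterator to a break point, save, then restore and continue. A condensed sketch of that flow, reusing the hypothetical make_save_restore_ops helper sketched earlier (the checkpoint path is illustrative):

    from tensorflow.python.client import session
    from tensorflow.python.data.ops import dataset_ops
    from tensorflow.python.framework import ops

    path = "/tmp/iterator"
    with ops.Graph().as_default() as g:
      iterator = dataset_ops.Dataset.range(2, 10).make_initializable_iterator()
      get_next = iterator.get_next()
      save_op, restore_op = make_save_restore_ops(iterator._iterator_resource,
                                                  path)
      with session.Session(graph=g) as sess:
        sess.run(iterator.initializer)
        for _ in range(3):        # consume 2, 3, 4
          sess.run(get_next)
        sess.run(save_op)         # checkpoint mid-stream
        sess.run(restore_op)      # rewind to the saved position
        assert sess.run(get_next) == 5  # resume where the save left off
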
diff --git a/tensorflow/python/kernel_tests/reader_dataset_ops_test.py b/tensorflow/python/kernel_tests/reader_dataset_ops_test.py
index c8e7333b4b..70b6ce442e 100644
--- a/tensorflow/python/kernel_tests/reader_dataset_ops_test.py
+++ b/tensorflow/python/kernel_tests/reader_dataset_ops_test.py
@@ -26,13 +26,8 @@ from tensorflow.python.data.ops import readers
from tensorflow.python.framework import constant_op
from tensorflow.python.framework import dtypes
from tensorflow.python.framework import errors
-from tensorflow.python.framework import ops
-from tensorflow.python.framework import tensor_shape
from tensorflow.python.lib.io import python_io
from tensorflow.python.ops import array_ops
-from tensorflow.python.ops import gen_dataset_ops
-from tensorflow.python.ops import io_ops
-from tensorflow.python.ops import parsing_ops
from tensorflow.python.platform import test
from tensorflow.python.util import compat
@@ -272,299 +267,6 @@ class FixedLengthRecordReaderTest(test.TestCase):
with self.assertRaises(errors.OutOfRangeError):
sess.run(iterator.get_next())
- def _iterator_checkpoint_path(self):
- return os.path.join(self.get_temp_dir(), "iterator")
-
- def _save_op(self, iterator_resource):
- iterator_state_variant = gen_dataset_ops.serialize_iterator(
- iterator_resource)
- save_op = io_ops.write_file(
- self._iterator_checkpoint_path(),
- parsing_ops.serialize_tensor(iterator_state_variant))
- return save_op
-
- def _restore_op(self, iterator_resource):
- iterator_state_variant = parsing_ops.parse_tensor(
- io_ops.read_file(self._iterator_checkpoint_path()), dtypes.variant)
- restore_op = gen_dataset_ops.deserialize_iterator(iterator_resource,
- iterator_state_variant)
- return restore_op
-
- def _build_iterator_graph(self, num_epochs):
- filenames = self._createFiles()
- dataset = (readers.FixedLengthRecordDataset(
- filenames, self._record_bytes, self._header_bytes, self._footer_bytes)
- .repeat(num_epochs))
- iterator = dataset.make_initializable_iterator()
- init_op = iterator.initializer
- get_next_op = iterator.get_next()
- save_op = self._save_op(iterator._iterator_resource)
- restore_op = self._restore_op(iterator._iterator_resource)
- return init_op, get_next_op, save_op, restore_op
-
- def _restore_iterator(self):
- output_types = dtypes.string
- output_shapes = tensor_shape.scalar()
- iterator = iterator_ops.Iterator.from_structure(output_types, output_shapes)
- get_next = iterator.get_next()
- restore_op = self._restore_op(iterator._iterator_resource)
- return restore_op, get_next
-
- def testSaveRestore(self):
- num_epochs = 10
- epoch_break = 5
- file_break = self._num_files // 2
- record_break = self._num_records // 2
-
- with ops.Graph().as_default() as g:
- init_op, get_next_op, save_op, restore_op = self._build_iterator_graph(
- num_epochs=num_epochs)
- with self.test_session(graph=g) as sess:
- sess.run(init_op)
- # Note: There is no checkpoint saved currently so a NotFoundError is
- # raised.
- with self.assertRaises(errors.NotFoundError):
- sess.run(restore_op)
- for epoch in range(num_epochs):
- for f in range(self._num_files):
- for r in range(self._num_records):
- if (epoch == epoch_break and f == file_break and
- r == record_break):
- sess.run(save_op)
- break
- self.assertEqual(self._record(f, r), sess.run(get_next_op))
- else:
- continue
- break
- else:
- continue
- break
- else:
- with self.assertRaises(errors.OutOfRangeError):
- sess.run(get_next_op)
-
- with ops.Graph().as_default() as g:
- init_op, get_next_op, save_op, restore_op = self._build_iterator_graph(
- num_epochs=num_epochs)
- with self.test_session(graph=g) as sess:
- sess.run(restore_op)
- for epoch in range(num_epochs):
- for f in range(self._num_files):
- for r in range(self._num_records):
- if (epoch < epoch_break or
- (epoch == epoch_break and f < file_break) or
- (epoch == epoch_break and f == file_break and
- r < record_break)):
- continue
- self.assertEqual(self._record(f, r), sess.run(get_next_op))
- with self.assertRaises(errors.OutOfRangeError):
- sess.run(get_next_op)
-
- def testInitThenRestore(self):
- # Note: Calling init_op before restore_op is redundant. This test just makes
- # sure we do not fail if restore is called on an already initialized
- # iterator resource.
- num_epochs = 10
- epoch_break = 5
- file_break = self._num_files // 2
- record_break = self._num_records // 2
-
- with ops.Graph().as_default() as g:
- init_op, get_next_op, save_op, restore_op = self._build_iterator_graph(
- num_epochs=num_epochs)
- with self.test_session(graph=g) as sess:
- sess.run(init_op)
- # Note: There is no checkpoint saved currently so a NotFoundError is
- # raised.
- with self.assertRaises(errors.NotFoundError):
- sess.run(restore_op)
- for epoch in range(num_epochs):
- for f in range(self._num_files):
- for r in range(self._num_records):
- if (epoch == epoch_break and f == file_break and
- r == record_break):
- sess.run(save_op)
- break
- self.assertEqual(self._record(f, r), sess.run(get_next_op))
- else:
- continue
- break
- else:
- continue
- break
- else:
- with self.assertRaises(errors.OutOfRangeError):
- sess.run(get_next_op)
-
- with ops.Graph().as_default() as g:
- init_op, get_next_op, save_op, restore_op = self._build_iterator_graph(
- num_epochs=num_epochs)
- with self.test_session(graph=g) as sess:
- sess.run(init_op)
- sess.run(restore_op)
- for epoch in range(num_epochs):
- for f in range(self._num_files):
- for r in range(self._num_records):
- if (epoch < epoch_break or
- (epoch == epoch_break and f < file_break) or
- (epoch == epoch_break and f == file_break and
- r < record_break)):
- continue
- self.assertEqual(self._record(f, r), sess.run(get_next_op))
- with self.assertRaises(errors.OutOfRangeError):
- sess.run(get_next_op)
-
- def testRestoreInModifiedGraph(self):
- num_epochs = 10
- num_epochs_1 = 20
- epoch_break = 5
- file_break = self._num_files // 2
- record_break = self._num_records // 2
-
- with ops.Graph().as_default() as g:
- init_op, get_next_op, save_op, restore_op = self._build_iterator_graph(
- num_epochs=num_epochs)
- with self.test_session(graph=g) as sess:
- sess.run(init_op)
- # Note: There is no checkpoint saved currently so a NotFoundError is
- # raised.
- with self.assertRaises(errors.NotFoundError):
- sess.run(restore_op)
- for epoch in range(num_epochs):
- for f in range(self._num_files):
- for r in range(self._num_records):
- if (epoch == epoch_break and f == file_break and
- r == record_break):
- sess.run(save_op)
- break
- self.assertEqual(self._record(f, r), sess.run(get_next_op))
- else:
- continue
- break
- else:
- continue
- break
- else:
- with self.assertRaises(errors.OutOfRangeError):
- sess.run(get_next_op)
-
- with ops.Graph().as_default() as g:
- init_op, get_next_op, save_op, restore_op = self._build_iterator_graph(
- num_epochs=num_epochs_1)
- with self.test_session(graph=g) as sess:
- sess.run(restore_op)
- for epoch in range(num_epochs):
- for f in range(self._num_files):
- for r in range(self._num_records):
- if (epoch < epoch_break or
- (epoch == epoch_break and f < file_break) or
- (epoch == epoch_break and f == file_break and
- r < record_break)):
- continue
- self.assertEqual(self._record(f, r), sess.run(get_next_op))
- with self.assertRaises(errors.OutOfRangeError):
- sess.run(get_next_op)
-
- def testRestoreWithoutBuildingDatasetGraph(self):
- num_epochs = 10
- epoch_break = 5
- file_break = self._num_files // 2
- record_break = self._num_records // 2
-
- with ops.Graph().as_default() as g:
- init_op, get_next_op, save_op, restore_op = self._build_iterator_graph(
- num_epochs=num_epochs)
- with self.test_session(graph=g) as sess:
- sess.run(init_op)
- # Note: There is no checkpoint saved currently so a NotFoundError is
- # raised.
- with self.assertRaises(errors.NotFoundError):
- sess.run(restore_op)
- for epoch in range(num_epochs):
- for f in range(self._num_files):
- for r in range(self._num_records):
- if (epoch == epoch_break and f == file_break and
- r == record_break):
- sess.run(save_op)
- break
- self.assertEqual(self._record(f, r), sess.run(get_next_op))
- else:
- continue
- break
- else:
- continue
- break
- else:
- with self.assertRaises(errors.OutOfRangeError):
- sess.run(get_next_op)
-
- with ops.Graph().as_default() as g:
- restore_op, get_next_op = self._restore_iterator()
- with self.test_session(graph=g) as sess:
- sess.run(restore_op)
- for epoch in range(num_epochs):
- for f in range(self._num_files):
- for r in range(self._num_records):
- if (epoch < epoch_break or
- (epoch == epoch_break and f < file_break) or
- (epoch == epoch_break and f == file_break and
- r < record_break)):
- continue
- self.assertEqual(self._record(f, r), sess.run(get_next_op))
- with self.assertRaises(errors.OutOfRangeError):
- sess.run(get_next_op)
-
- def testRestoreUnusedIterator(self):
- num_epochs = 10
- with ops.Graph().as_default() as g:
- init_op, get_next_op, save_op, restore_op = self._build_iterator_graph(
- num_epochs=num_epochs)
- with self.test_session(graph=g) as sess:
- sess.run(init_op)
- # Note: There is no checkpoint saved currently so a NotFoundError is
- # raised.
- with self.assertRaises(errors.NotFoundError):
- sess.run(restore_op)
- # Save unused iterator.
- sess.run(save_op)
- with ops.Graph().as_default() as g:
- init_op, get_next_op, save_op, restore_op = self._build_iterator_graph(
- num_epochs=num_epochs)
- with self.test_session(graph=g) as sess:
- sess.run(restore_op)
- for _ in range(num_epochs * self._num_files * self._num_records):
- sess.run(get_next_op)
- with self.assertRaises(errors.OutOfRangeError):
- sess.run(get_next_op)
-
- def testRestoreExhaustedIterator(self):
- num_epochs = 10
-
- with ops.Graph().as_default() as g:
- init_op, get_next_op, save_op, restore_op = self._build_iterator_graph(
- num_epochs=num_epochs)
- with self.test_session(graph=g) as sess:
- sess.run(init_op)
- # Note: There is no checkpoint saved currently so a NotFoundError is
- # raised.
- with self.assertRaises(errors.NotFoundError):
- sess.run(restore_op)
- for _ in range(num_epochs):
- for f in range(self._num_files):
- for r in range(self._num_records):
- self.assertEqual(self._record(f, r), sess.run(get_next_op))
- with self.assertRaises(errors.OutOfRangeError):
- sess.run(get_next_op)
- sess.run(save_op)
-
- with ops.Graph().as_default() as g:
- init_op, get_next_op, save_op, restore_op = self._build_iterator_graph(
- num_epochs=num_epochs)
- with self.test_session(graph=g) as sess:
- sess.run(restore_op)
- with self.assertRaises(errors.OutOfRangeError):
- sess.run(get_next_op)
-
class TFRecordDatasetTest(test.TestCase):
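
The triple-nested loops in the deleted tests above lean on Python's for/else: an else clause on a for loop runs only when the loop finishes without break, so a break at the save point unwinds all three levels, while a fully consumed run falls through to the OutOfRangeError check. The construct in isolation:

    # for/else: "else" fires only if the loop saw no "break".
    for epoch in range(3):
      for record in range(4):
        if (epoch, record) == (1, 2):
          break                   # stop mid-epoch...
      else:
        continue                  # inner loop completed; go to next epoch
      break                       # ...and propagate the break outward
    print(epoch, record)          # -> 1 2
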
diff --git a/tensorflow/python/kernel_tests/reader_ops_test.py b/tensorflow/python/kernel_tests/reader_ops_test.py
index 5630259b7b..8e54d10f32 100644
--- a/tensorflow/python/kernel_tests/reader_ops_test.py
+++ b/tensorflow/python/kernel_tests/reader_ops_test.py
@@ -35,6 +35,9 @@ from tensorflow.python.ops import data_flow_ops
from tensorflow.python.ops import io_ops
from tensorflow.python.ops import variables
from tensorflow.python.platform import test
+from tensorflow.python.training import coordinator
+from tensorflow.python.training import input as input_lib
+from tensorflow.python.training import queue_runner_impl
from tensorflow.python.util import compat
prefix_path = "tensorflow/core/lib"
@@ -1011,6 +1014,25 @@ class LMDBReaderTest(test.TestCase):
"\\(requested 1, current size 0\\)"):
k, v = sess.run([key, value])
+ def testReadFromSameFile(self):
+ with self.test_session() as sess:
+ reader1 = io_ops.LMDBReader(name="test_read_from_same_file1")
+ reader2 = io_ops.LMDBReader(name="test_read_from_same_file2")
+ filename_queue = input_lib.string_input_producer([self.db_path],
+ num_epochs=None)
+ key1, value1 = reader1.read(filename_queue)
+ key2, value2 = reader2.read(filename_queue)
+
+ coord = coordinator.Coordinator()
+ threads = queue_runner_impl.start_queue_runners(sess, coord=coord)
+ for i in range(3):
+ for j in range(10):
+ k1, v1, k2, v2 = sess.run([key1, value1, key2, value2])
+ self.assertAllEqual(compat.as_bytes(k1), compat.as_bytes(k2))
+ self.assertAllEqual(compat.as_bytes(v1), compat.as_bytes(v2))
+ coord.request_stop()
+ coord.join(threads)
+
def testReadFromFolder(self):
with self.test_session() as sess:
reader = io_ops.LMDBReader(name="test_read_from_folder")
@@ -1029,6 +1051,25 @@ class LMDBReaderTest(test.TestCase):
"\\(requested 1, current size 0\\)"):
k, v = sess.run([key, value])
+ def testReadFromFileRepeatedly(self):
+ with self.test_session() as sess:
+ reader = io_ops.LMDBReader(name="test_read_from_file_repeated")
+ filename_queue = input_lib.string_input_producer([self.db_path],
+ num_epochs=None)
+ key, value = reader.read(filename_queue)
+
+ coord = coordinator.Coordinator()
+ threads = queue_runner_impl.start_queue_runners(sess, coord=coord)
+ # Iterate over the LMDB database 3 times.
+ for i in range(3):
+ # Go over all 10 records each time.
+ for j in range(10):
+ k, v = sess.run([key, value])
+ self.assertAllEqual(compat.as_bytes(k), compat.as_bytes(str(j)))
+ self.assertAllEqual(
+ compat.as_bytes(v), compat.as_bytes(str(chr(ord("a") + j))))
+ coord.request_stop()
+ coord.join(threads)
if __name__ == "__main__":
test.main()
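
The new LMDB tests use the classic TF 1.x input-pipeline lifecycle: string_input_producer feeds a filename queue, start_queue_runners spins up background threads to fill it, and a Coordinator shuts everything down. A sketch of the general lifecycle, assuming the framework's errors module is in scope (the op being run is a placeholder):

    coord = coordinator.Coordinator()
    threads = queue_runner_impl.start_queue_runners(sess, coord=coord)
    try:
      while not coord.should_stop():
        sess.run(read_or_train_op)  # hypothetical op fed by the queue
    except errors.OutOfRangeError:
      pass                          # producer exhausted (num_epochs reached)
    finally:
      coord.request_stop()          # ask the runner threads to exit
      coord.join(threads)           # and wait until they do
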
diff --git a/tensorflow/python/kernel_tests/segment_reduction_ops_test.py b/tensorflow/python/kernel_tests/segment_reduction_ops_test.py
index 516a9d000e..3a02f24902 100644
--- a/tensorflow/python/kernel_tests/segment_reduction_ops_test.py
+++ b/tensorflow/python/kernel_tests/segment_reduction_ops_test.py
@@ -323,8 +323,9 @@ class UnsortedSegmentSumTest(SegmentReductionHelper):
def testBadIndices(self):
# Note: GPU kernel does not return the out-of-range error needed for this
# test, so this test is marked as cpu-only.
+ # Note: With PR #13055 a negative index will be ignored silently.
with self.test_session(use_gpu=False):
- for bad in [[-1]], [[7]]:
+ for bad in [[2]], [[7]]:
unsorted = math_ops.unsorted_segment_sum([[17]], bad, num_segments=2)
with self.assertRaisesOpError(
r"segment_ids\[0,0\] = %d is out of range \[0, 2\)" % bad[0][0]):
@@ -360,6 +361,32 @@ class UnsortedSegmentSumTest(SegmentReductionHelper):
x_init_value=np_x.astype(np.double), delta=1)
self.assertAllClose(jacob_t, jacob_n)
+ def testDropNegatives(self):
+ # Note: the test replaces segment id 8 with -1 in the indices and zeroes
+ # out the corresponding rows in the numpy-computed expectation.
+ dtypes = [
+ dtypes_lib.float32, dtypes_lib.float64, dtypes_lib.int64,
+ dtypes_lib.int32, dtypes_lib.complex64, dtypes_lib.complex128
+ ]
+ indices_flat = np.array([0, 4, 0, 8, 3, 8, 4, 7, 7, 3])
+ num_segments = 12
+ for indices in indices_flat, indices_flat.reshape(5, 2):
+ shape = indices.shape + (2,)
+ for dtype in dtypes:
+ with self.test_session(use_gpu=True):
+ tf_x, np_x = self._input(shape, dtype=dtype)
+ np_ans = self._segmentReduce(
+ indices, np_x, np.add, op2=None, num_out_rows=num_segments)
+ # Zero out the rows for segment ids >= 8 in the expected result
+ np_ans[8:] = 0
+ # Replace segment id 8 with -1 in indices so those rows get dropped
+ np.place(indices, indices == 8, [-1])
+ s = math_ops.unsorted_segment_sum(
+ data=tf_x, segment_ids=indices, num_segments=num_segments)
+ tf_ans = s.eval()
+ self.assertAllClose(np_ans, tf_ans)
+ self.assertShapeEqual(np_ans, s)
+
class SparseSegmentReductionHelper(SegmentReductionHelper):
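
The new testDropNegatives encodes the post-PR-#13055 contract: a negative segment id silently drops its row rather than raising. A tiny worked example of the arithmetic, assuming that behavior (values chosen here, not taken from the test):

    # data         = [1, 2, 3]
    # segment_ids  = [0, -1, 0]   # the -1 entry is dropped
    # num_segments = 2            # segments start zero-initialized
    # result       = [1 + 3, 0] = [4, 0]
    s = math_ops.unsorted_segment_sum(
        data=[1, 2, 3], segment_ids=[0, -1, 0], num_segments=2)
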
diff --git a/tensorflow/python/kernel_tests/shape_ops_test.py b/tensorflow/python/kernel_tests/shape_ops_test.py
index a9fc699b21..7368251ab6 100644
--- a/tensorflow/python/kernel_tests/shape_ops_test.py
+++ b/tensorflow/python/kernel_tests/shape_ops_test.py
@@ -258,6 +258,16 @@ class ShapeOpsTest(test.TestCase):
self.assertAllEqual([True], array_ops.expand_dims(inp, 0).eval())
self.assertAllEqual([True], array_ops.expand_dims(inp, -1).eval())
+ def testExpandDimsDimType(self):
+ for dtype in [dtypes.int32, dtypes.int64]:
+ x = np.zeros([2])
+ np_ans = np.expand_dims(x, axis=0)
+ with self.test_session(use_gpu=True):
+ tensor = array_ops.expand_dims(x, constant_op.constant(0, dtype))
+ tf_ans = tensor.eval()
+ self.assertShapeEqual(np_ans, tensor)
+ self.assertAllEqual(np_ans, tf_ans)
+
def _compareSqueeze(self, x, squeeze_dims, use_gpu):
with self.test_session(use_gpu=use_gpu):
if squeeze_dims:
diff --git a/tensorflow/python/kernel_tests/slice_op_test.py b/tensorflow/python/kernel_tests/slice_op_test.py
index 051a25080b..6cdc7872f9 100644
--- a/tensorflow/python/kernel_tests/slice_op_test.py
+++ b/tensorflow/python/kernel_tests/slice_op_test.py
@@ -217,6 +217,30 @@ class SliceTest(test.TestCase):
self.assertEqual(expected_val.shape, slice_t.get_shape())
self.assertEqual(expected_val.shape, slice2_t.get_shape())
+ def testRandomHighRank(self):
+ # Random dims of rank 8
+ input_shape = np.random.randint(0, 20, size=8)
+ inp = np.random.rand(*input_shape).astype("f")
+ with self.test_session(use_gpu=True) as sess:
+ a = constant_op.constant(
+ [float(x) for x in inp.ravel(order="C")],
+ shape=input_shape,
+ dtype=dtypes.float32)
+ indices = [0 if x == 0 else np.random.randint(x) for x in input_shape]
+ sizes = [
+ np.random.randint(0, input_shape[i] - indices[i] + 1)
+ for i in range(8)
+ ]
+ slice_t = array_ops.slice(a, indices, sizes)
+ slice_val = sess.run(slice_t)
+
+      expected_val = inp[indices[0]:indices[0] + sizes[0],
+                         indices[1]:indices[1] + sizes[1],
+                         indices[2]:indices[2] + sizes[2],
+                         indices[3]:indices[3] + sizes[3],
+                         indices[4]:indices[4] + sizes[4],
+                         indices[5]:indices[5] + sizes[5],
+                         indices[6]:indices[6] + sizes[6],
+                         indices[7]:indices[7] + sizes[7]]
+ self.assertAllEqual(slice_val, expected_val)
+ self.assertEqual(expected_val.shape, slice_t.get_shape())
+
def testPartialShapeInference(self):
z = array_ops.zeros((1, 2, 3))
self.assertAllEqual(z.get_shape().as_list(), [1, 2, 3])
@@ -227,7 +251,6 @@ class SliceTest(test.TestCase):
m2 = array_ops.slice(z, [0, 0, 0], [constant_op.constant(1) + 0, 2, -1])
self.assertAllEqual(m2.get_shape().as_list(), [None, 2, None])
-
def _testGradientSlice(self, input_shape, slice_begin, slice_size):
with self.test_session(use_gpu=True):
num_inputs = np.prod(input_shape)
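
Since the test already keeps parallel indices and sizes lists, the explicit eight-axis slice above can also be built programmatically; a NumPy-equivalent sketch:

    # Same result as the explicit 8-axis slice expression:
    expected_val = inp[tuple(
        slice(begin, begin + size) for begin, size in zip(indices, sizes))]
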
diff --git a/tensorflow/python/kernel_tests/unique_op_test.py b/tensorflow/python/kernel_tests/unique_op_test.py
index a50f53b3cd..04758ce45a 100644
--- a/tensorflow/python/kernel_tests/unique_op_test.py
+++ b/tensorflow/python/kernel_tests/unique_op_test.py
@@ -22,6 +22,7 @@ import numpy as np
from tensorflow.python.framework import dtypes
from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import gen_array_ops
from tensorflow.python.platform import test
@@ -61,6 +62,31 @@ class UniqueTest(test.TestCase):
for i in range(len(x)):
self.assertEqual(x[i], tf_y[tf_idx[i]].decode('ascii'))
+ def testInt32Axis(self):
+ x = np.array([[1, 0, 0], [1, 0, 0], [2, 0, 0]])
+ with self.test_session() as sess:
+ y0, idx0 = gen_array_ops.unique_v2(x, axis=[0])
+ tf_y0, tf_idx0 = sess.run([y0, idx0])
+ y1, idx1 = gen_array_ops.unique_v2(x, axis=[1])
+ tf_y1, tf_idx1 = sess.run([y1, idx1])
+ self.assertAllEqual(tf_y0, np.array([[1, 0, 0], [2, 0, 0]]))
+ self.assertAllEqual(tf_idx0, np.array([0, 0, 1]))
+ self.assertAllEqual(tf_y1, np.array([[1, 0], [1, 0], [2, 0]]))
+ self.assertAllEqual(tf_idx1, np.array([0, 1, 1]))
+
+ def testInt32V2(self):
+ # This test is only temporary; once V2 is used by default,
+ # the axis argument will be wrapped to allow `axis=None`.
+ x = np.random.randint(2, high=10, size=7000)
+ with self.test_session() as sess:
+ y, idx = gen_array_ops.unique_v2(x, axis=[])
+ tf_y, tf_idx = sess.run([y, idx])
+
+ self.assertEqual(len(x), len(tf_idx))
+ self.assertEqual(len(tf_y), len(np.unique(x)))
+ for i in range(len(x)):
+ self.assertEqual(x[i], tf_y[tf_idx[i]])
+
class UniqueWithCountsTest(test.TestCase):
def testInt32(self):
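
For intuition, unique_v2 with an axis behaves like NumPy's axis-aware np.unique, with the idx output as the inverse mapping. A cross-check against the testInt32Axis values (needs NumPy >= 1.13 for the axis argument; note that np.unique sorts its output while unique_v2 keeps first-occurrence order):

    import numpy as np

    x = np.array([[1, 0, 0], [1, 0, 0], [2, 0, 0]])
    y, idx = np.unique(x, axis=0, return_inverse=True)
    # y   -> [[1, 0, 0], [2, 0, 0]]   (matches the test; already sorted)
    # idx -> [0, 0, 1]
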
diff --git a/tensorflow/python/kernel_tests/variable_scope_test.py b/tensorflow/python/kernel_tests/variable_scope_test.py
index 5396214956..bd4b12b7e8 100644
--- a/tensorflow/python/kernel_tests/variable_scope_test.py
+++ b/tensorflow/python/kernel_tests/variable_scope_test.py
@@ -117,18 +117,6 @@ class VariableScopeTest(test.TestCase):
w = variable_scope.get_variable("w", [])
self.assertEqual(w.dtype.base_dtype, dtypes.float16)
- def testEagerVaribleStore(self):
- with context.eager_mode():
- store = variable_scope.EagerVariableStore()
- with store.as_default():
- v = variable_scope.get_variable("v", shape=(), trainable=True)
- w = variable_scope.get_variable("w", shape=(), trainable=False)
-
- self.assertTrue(v in store.variables())
- self.assertTrue(w in store.variables())
- self.assertTrue(v in store.trainable_variables())
- self.assertFalse(w in store.trainable_variables())
-
@test_util.run_in_graph_and_eager_modes()
def testInitFromNonTensorValue(self):
v = variable_scope.get_variable("v4", initializer=4, dtype=dtypes.int32)
diff --git a/tensorflow/python/kernel_tests/xent_op_test.py b/tensorflow/python/kernel_tests/xent_op_test.py
index 43be08f8a1..4b3dadc112 100644
--- a/tensorflow/python/kernel_tests/xent_op_test.py
+++ b/tensorflow/python/kernel_tests/xent_op_test.py
@@ -181,24 +181,6 @@ class XentTest(test.TestCase):
print("cross entropy gradient err = ", err)
self.assertLess(err, 5e-8)
- def testGradientLabelWithV2(self):
- with self.test_session():
- l = constant_op.constant(
- [0.0, 0.0, 1.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.5, 0.0, 0.5],
- shape=[3, 4],
- dtype=dtypes.float64,
- name="l")
- f = constant_op.constant(
- [0.1, 0.2, 0.3, 0.4, 0.1, 0.4, 0.9, 1.6, 0.1, 0.8, 2.7, 6.4],
- shape=[3, 4],
- dtype=dtypes.float64,
- name="f")
- x = nn_ops.softmax_cross_entropy_with_logits_v2(labels=l, logits=f,
- name="xent")
- err = gradient_checker.compute_gradient_error(l, [3, 4], x, [3])
-
- self.assertLess(err, 5e-8)
-
def testSecondGradient(self):
with self.test_session() as sess:
l = constant_op.constant([0.0, 0.0, 1.0/3, 0.0,
diff --git a/tensorflow/python/layers/base.py b/tensorflow/python/layers/base.py
index c71e8382e9..db608aa79a 100644
--- a/tensorflow/python/layers/base.py
+++ b/tensorflow/python/layers/base.py
@@ -220,7 +220,7 @@ class Layer(object):
Weight updates (for instance, the updates of the moving mean and variance
in a BatchNormalization layer) may be dependent on the inputs passed
- when calling a layer. Hence, when reusing a same layer on
+ when calling a layer. Hence, when reusing the same layer on
different inputs `a` and `b`, some entries in `layer.updates` may be
dependent on `a` and some on `b`. This method automatically keeps track
of dependencies.
@@ -294,9 +294,9 @@ class Layer(object):
"""Add loss tensor(s), potentially dependent on layer inputs.
Some losses (for instance, activity regularization losses) may be dependent
- on the inputs passed when calling a layer. Hence, when reusing a same layer
- on different inputs `a` and `b`, some entries in `layer.losses` may be
- dependent on `a` and some on `b`. This method automatically keeps track
+ on the inputs passed when calling a layer. Hence, when reusing the same
+ layer on different inputs `a` and `b`, some entries in `layer.losses` may
+ be dependent on `a` and some on `b`. This method automatically keeps track
of dependencies.
The `get_losses_for` method allows to retrieve the losses relevant to a
@@ -401,11 +401,10 @@ class Layer(object):
"""
return input_shape
- def _make_unique_name(self, name_uid_map=None, avoid_names=None,
- namespace=''):
+ def _make_unique_name(self, name_uid_map=None, avoid_names=None):
base_name = _to_snake_case(self.__class__.__name__)
name = _unique_layer_name(base_name, name_uid_map=name_uid_map,
- avoid_names=avoid_names, namespace=namespace)
+ avoid_names=avoid_names)
return (name, base_name)
def _set_scope(self, scope=None):
@@ -642,7 +641,7 @@ class Layer(object):
for output in output_list:
with ops.name_scope('ActivityRegularizer'):
activity_regularization = self._activity_regularizer(output)
- self.add_loss(activity_regularization, inputs=inputs)
+ self.add_loss(activity_regularization)
if not in_deferred_mode:
# TODO(fchollet): consider how masking will work with deferred mode.
@@ -2371,7 +2370,7 @@ def _get_default_graph_uid_map():
return name_uid_map
-def _unique_layer_name(name, name_uid_map=None, avoid_names=None, namespace=''):
+def _unique_layer_name(name, name_uid_map=None, avoid_names=None):
"""Makes a layer name (or arbitrary string) unique within a TensorFlow graph.
Arguments:
@@ -2380,9 +2379,6 @@ def _unique_layer_name(name, name_uid_map=None, avoid_names=None, namespace=''):
names. If None (default), uses a per-Graph dictionary.
avoid_names: An optional set or dict with names which should not be used. If
None (default) does not avoid any names.
- namespace: Gets a name which is unique within the (graph, namespace). Layers
- which are not Networks use a blank namespace and so get graph-global
- names.
Returns:
Unique string name.
@@ -2400,7 +2396,6 @@ def _unique_layer_name(name, name_uid_map=None, avoid_names=None, namespace=''):
avoid_names = set()
proposed_name = None
while proposed_name is None or proposed_name in avoid_names:
- name_key = (namespace, name)
- name_uid_map[name_key] += 1
- proposed_name = name + '_' + str(name_uid_map[name_key])
+ name_uid_map[name] += 1
+ proposed_name = name + '_' + str(name_uid_map[name])
return proposed_name
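
After this change, _unique_layer_name keys its counters on the bare name alone, so names are unique per graph rather than per (graph, namespace). The core of the uniquifier is just a counter map; a standalone sketch (the function name is invented here):

    import collections

    def unique_name(name, name_uid_map=None, avoid_names=()):
      """Return name_N, bumping N until the result is not in avoid_names."""
      if name_uid_map is None:
        name_uid_map = collections.defaultdict(int)
      proposed = None
      while proposed is None or proposed in avoid_names:
        name_uid_map[name] += 1
        proposed = '%s_%d' % (name, name_uid_map[name])
      return proposed

    # unique_name('dense', m) -> 'dense_1', then 'dense_2', ...
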
diff --git a/tensorflow/python/layers/base_test.py b/tensorflow/python/layers/base_test.py
index 509ad5a7af..71eff2f965 100644
--- a/tensorflow/python/layers/base_test.py
+++ b/tensorflow/python/layers/base_test.py
@@ -47,7 +47,7 @@ class BaseLayerTest(test.TestCase):
self.assertEqual(layer.trainable_variables, [])
self.assertEqual(layer.non_trainable_variables, [])
if context.in_graph_mode():
- # updates, losses only supported in GRAPH mode
+ # updates, losses only supported in GRAPH mode
self.assertEqual(layer.updates, [])
self.assertEqual(layer.losses, [])
self.assertEqual(layer.built, False)
@@ -574,13 +574,6 @@ class BaseLayerTest(test.TestCase):
self.assertEqual(3, result['label'].numpy())
self.assertEqual(4.0, result['logits'].numpy())
- def testActivityRegularizer(self):
- regularizer = math_ops.reduce_sum
- layer = base_layers.Layer(activity_regularizer=regularizer)
- x = array_ops.placeholder('int32')
- layer.apply(x)
- self.assertEqual(len(layer.get_losses_for(x)), 1)
-
class NetworkTest(test.TestCase):
diff --git a/tensorflow/python/layers/convolutional.py b/tensorflow/python/layers/convolutional.py
index 0c7ce02835..8c327d7e27 100644
--- a/tensorflow/python/layers/convolutional.py
+++ b/tensorflow/python/layers/convolutional.py
@@ -813,6 +813,7 @@ def conv3d(inputs,
bias_constraint=bias_constraint,
trainable=trainable,
name=name,
+ dtype=inputs.dtype.base_dtype,
_reuse=reuse,
_scope=name)
return layer.apply(inputs)
@@ -1746,6 +1747,7 @@ def conv3d_transpose(inputs,
bias_constraint=bias_constraint,
trainable=trainable,
name=name,
+ dtype=inputs.dtype.base_dtype,
_reuse=reuse,
_scope=name)
return layer.apply(inputs)
diff --git a/tensorflow/python/layers/normalization.py b/tensorflow/python/layers/normalization.py
index a9d59b25a3..dc39e96f87 100644
--- a/tensorflow/python/layers/normalization.py
+++ b/tensorflow/python/layers/normalization.py
@@ -26,6 +26,7 @@ import numpy as np
from tensorflow.python.eager import context
from tensorflow.python.framework import constant_op
+from tensorflow.python.framework import dtypes
from tensorflow.python.framework import ops
from tensorflow.python.framework import tensor_shape
from tensorflow.python.layers import base
@@ -236,6 +237,12 @@ class BatchNormalization(base.Layer):
raise ValueError('Unsupported axis, fused batch norm only supports '
'axis == [1] or axis == [3]')
+ # Promote the parameters of fp16 batch norm to fp32
+ if self.dtype == dtypes.float16:
+ param_dtype = dtypes.float32
+ else:
+ param_dtype = self.dtype or dtypes.float32
+
axis_to_dim = {x: input_shape[x].value for x in self.axis}
for x in axis_to_dim:
if axis_to_dim[x] is None:
@@ -259,6 +266,7 @@ class BatchNormalization(base.Layer):
if self.scale:
self.gamma = self.add_variable(name='gamma',
shape=param_shape,
+ dtype=param_dtype,
initializer=self.gamma_initializer,
regularizer=self.gamma_regularizer,
constraint=self.gamma_constraint,
@@ -266,11 +274,14 @@ class BatchNormalization(base.Layer):
else:
self.gamma = None
if self.fused:
- self._gamma_const = array_ops.constant(1.0, shape=param_shape)
+ self._gamma_const = array_ops.constant(1.0,
+ dtype=param_dtype,
+ shape=param_shape)
if self.center:
self.beta = self.add_variable(name='beta',
shape=param_shape,
+ dtype=param_dtype,
initializer=self.beta_initializer,
regularizer=self.beta_regularizer,
constraint=self.beta_constraint,
@@ -278,7 +289,9 @@ class BatchNormalization(base.Layer):
else:
self.beta = None
if self.fused:
- self._beta_const = array_ops.constant(0.0, shape=param_shape)
+ self._beta_const = array_ops.constant(0.0,
+ dtype=param_dtype,
+ shape=param_shape)
# Disable variable partitioning when creating the moving mean and variance
try:
@@ -290,12 +303,14 @@ class BatchNormalization(base.Layer):
self.moving_mean = self.add_variable(
name='moving_mean',
shape=param_shape,
+ dtype=param_dtype,
initializer=self.moving_mean_initializer,
trainable=False)
self.moving_variance = self.add_variable(
name='moving_variance',
shape=param_shape,
+ dtype=param_dtype,
initializer=self.moving_variance_initializer,
trainable=False)
@@ -311,6 +326,7 @@ class BatchNormalization(base.Layer):
def _renorm_variable(name, shape):
var = self.add_variable(name=name,
shape=shape,
+ dtype=param_dtype,
initializer=init_ops.zeros_initializer(),
trainable=False)
return var
@@ -353,7 +369,6 @@ class BatchNormalization(base.Layer):
def _fused_batch_norm(self, inputs, training):
"""Returns the output of fused batch norm."""
- # TODO(reedwm): Add support for fp16 inputs.
beta = self.beta if self.center else self._beta_const
gamma = self.gamma if self.scale else self._gamma_const
@@ -749,6 +764,7 @@ def batch_normalization(inputs,
virtual_batch_size=virtual_batch_size,
adjustment=adjustment,
name=name,
+ dtype=inputs.dtype.base_dtype,
_reuse=reuse,
_scope=name)
return layer.apply(inputs, training=training)
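
The normalization changes implement the usual mixed-precision rule: float16 activations, float32 parameters and moving statistics. A sketch of what the new dtype plumbing enables, using the same calls as the tests that follow:

    inputs = random_ops.random_uniform((5, 4, 3, 3), dtype=dtypes.float16)
    bn = normalization_layers.BatchNormalization(axis=1, fused=True)
    outputs = bn.apply(inputs, training=True)
    # outputs.dtype is float16, while every variable in bn.variables stays
    # float32, so statistics are accumulated at full precision.
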
diff --git a/tensorflow/python/layers/normalization_test.py b/tensorflow/python/layers/normalization_test.py
index 90ebdc8c86..b2876c58c2 100644
--- a/tensorflow/python/layers/normalization_test.py
+++ b/tensorflow/python/layers/normalization_test.py
@@ -68,11 +68,12 @@ class BNTest(test.TestCase):
use_gpu,
is_fused,
restore=False,
- freeze_mode=False):
+ freeze_mode=False,
+ dtype=dtypes.float32):
ops.reset_default_graph()
graph = ops.get_default_graph()
with self.test_session(graph=graph, use_gpu=use_gpu) as sess:
- image = array_ops.placeholder(dtype='float32', shape=shape)
+ image = array_ops.placeholder(dtype=dtype, shape=shape)
loss, train_op, saver = self._simple_model(image, is_fused, freeze_mode)
if restore:
saver.restore(sess, checkpoint_path)
@@ -80,7 +81,7 @@ class BNTest(test.TestCase):
sess.run(variables.global_variables_initializer())
np.random.seed(0)
for _ in range(2):
- image_val = np.random.rand(*shape).astype(np.float32)
+ image_val = np.random.rand(*shape).astype(dtype.as_numpy_dtype)
sess.run([loss, train_op], feed_dict={image: image_val})
if restore:
all_vars = ops.get_collection(ops.GraphKeys.GLOBAL_VARIABLES)
@@ -90,15 +91,74 @@ class BNTest(test.TestCase):
saver.save(sess, checkpoint_path)
def _infer(self, checkpoint_path, image_val, shape, use_gpu, is_fused):
+ dtype = image_val.dtype
ops.reset_default_graph()
graph = ops.get_default_graph()
with self.test_session(graph=graph, use_gpu=use_gpu) as sess:
- image = array_ops.placeholder(dtype='float32', shape=shape)
+ image = array_ops.placeholder(dtype=dtype, shape=shape)
loss, _, saver = self._simple_model(image, is_fused, True)
saver.restore(sess, checkpoint_path)
loss_val = sess.run(loss, feed_dict={image: image_val})
return loss_val
+ def _trainEvalSequence(self,
+ dtype,
+ train1_use_gpu,
+ train2_use_gpu,
+ infer_use_gpu):
+ batch, height, width, input_channels = 2, 4, 5, 3
+ shape = [batch, height, width, input_channels]
+ checkpoint = os.path.join(self.get_temp_dir(), 'cp_%s_%s_%s_%s' %
+ (dtype, train1_use_gpu, train2_use_gpu, infer_use_gpu))
+
+ self._train(
+ checkpoint,
+ shape,
+ use_gpu=train1_use_gpu,
+ is_fused=True,
+ restore=False,
+ freeze_mode=False,
+ dtype=dtype)
+
+ train_vars = self._train(
+ checkpoint,
+ shape,
+ use_gpu=train2_use_gpu,
+ is_fused=True,
+ restore=True,
+ freeze_mode=False,
+ dtype=dtype)
+
+ np.random.seed(0)
+ image_val = np.random.rand(batch,
+ height,
+ width,
+ input_channels).astype(dtype.as_numpy_dtype)
+ loss_val = self._infer(checkpoint, image_val, shape,
+ use_gpu=infer_use_gpu, is_fused=True)
+
+ return train_vars, loss_val
+
+ def testHalfPrecision(self):
+ ref_vars, ref_loss = self._trainEvalSequence(dtype=dtypes.float32,
+ train1_use_gpu=True,
+ train2_use_gpu=True,
+ infer_use_gpu=True)
+
+ self.assertEqual(len(ref_vars), 5)
+
+ for train1_use_gpu in [True, False]:
+ for train2_use_gpu in [True, False]:
+ for infer_use_gpu in [True, False]:
+ test_vars, test_loss = self._trainEvalSequence(dtypes.float16,
+ train1_use_gpu,
+ train2_use_gpu,
+ infer_use_gpu)
+ self.assertEqual(len(test_vars), 5)
+ for test_var, ref_var in zip(test_vars, ref_vars):
+ self.assertAllClose(test_var, ref_var, rtol=1.e-3, atol=1.e-3)
+ self.assertAllClose(test_loss, ref_loss, rtol=1.e-3, atol=1.e-3)
+
def _testCheckpoint(self, is_fused_checkpoint_a, is_fused_checkpoint_b,
use_gpu_checkpoint_a, use_gpu_checkpoint_b,
use_gpu_test_a, use_gpu_test_b, freeze_mode):
@@ -218,6 +278,36 @@ class BNTest(test.TestCase):
ops.get_collection(ops.GraphKeys.TRAINABLE_VARIABLES),
bn.trainable_variables)
+ def testCreateFusedBNFloat16(self):
+ # Call layer.
+ bn = normalization_layers.BatchNormalization(axis=1, fused=True)
+ inputs = random_ops.random_uniform((5, 4, 3, 3),
+ seed=1,
+ dtype=dtypes.float16)
+ training = array_ops.placeholder(dtype='bool')
+ outputs = bn.apply(inputs, training=training)
+
+ # Verify shape.
+ self.assertListEqual(outputs.get_shape().as_list(), [5, 4, 3, 3])
+
+ # Verify layer attributes.
+ self.assertEqual(len(bn.updates), 2)
+ self.assertEqual(len(bn.variables), 4)
+ self.assertEqual(len(bn.trainable_variables), 2)
+ self.assertEqual(len(bn.non_trainable_variables), 2)
+ for var in bn.variables:
+ self.assertEqual(var.dtype, dtypes.float32_ref)
+
+ # Test that updates were created and added to UPDATE_OPS.
+ self.assertEqual(len(bn.updates), 2)
+ self.assertListEqual(
+ ops.get_collection(ops.GraphKeys.UPDATE_OPS), bn.updates)
+
+ # Test that weights were created and added to TRAINABLE_VARIABLES.
+ self.assertListEqual(
+ ops.get_collection(ops.GraphKeys.TRAINABLE_VARIABLES),
+ bn.trainable_variables)
+
def test3DInputAxis1(self):
epsilon = 1e-3
bn = normalization_layers.BatchNormalization(
diff --git a/tensorflow/python/ops/array_grad.py b/tensorflow/python/ops/array_grad.py
index 87f8d14860..3c025881cb 100644
--- a/tensorflow/python/ops/array_grad.py
+++ b/tensorflow/python/ops/array_grad.py
@@ -460,11 +460,7 @@ def _GatherNdGrad(op, grad):
ref = op.inputs[0]
indices = op.inputs[1]
ref_shape = array_ops.shape(ref, out_type=indices.dtype)
- if indices.shape.ndims == 2 and indices.shape[-1].value == 1:
- ref_grad = ops.IndexedSlices(grad, array_ops.squeeze(indices, axis=-1),
- ref_shape)
- else:
- ref_grad = array_ops.scatter_nd(indices, grad, ref_shape)
+ ref_grad = array_ops.scatter_nd(indices, grad, ref_shape)
return [ref_grad, None]
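
With the IndexedSlices fast path removed, every GatherNd gradient goes through scatter_nd, which is gather_nd's adjoint: incoming gradient slices are scattered back to the positions they were gathered from, and duplicate indices accumulate by summation. A small worked example of that accumulation (values invented here):

    # Forward: gather_nd(params, [[1], [1]]) reads element 1 twice.
    # Backward: the two upstream gradients sum into position 1.
    ref_grad = array_ops.scatter_nd(
        indices=[[1], [1]], updates=[2.0, 3.0], shape=[3])
    # -> [0.0, 5.0, 0.0]
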
diff --git a/tensorflow/python/ops/array_ops.py b/tensorflow/python/ops/array_ops.py
index 61bd41e7de..f5f1278bfd 100644
--- a/tensorflow/python/ops/array_ops.py
+++ b/tensorflow/python/ops/array_ops.py
@@ -1136,7 +1136,7 @@ def concat(values, axis, name="concat"):
return gen_array_ops._concat_v2(values=values, axis=axis, name=name)
-def boolean_mask(tensor, mask, name="boolean_mask"):
+def boolean_mask(tensor, mask, name="boolean_mask", axis=None):
"""Apply boolean mask to tensor. Numpy equivalent is `tensor[mask]`.
```python
@@ -1150,11 +1150,17 @@ def boolean_mask(tensor, mask, name="boolean_mask"):
the first K dimensions of `tensor`'s shape. We then have:
`boolean_mask(tensor, mask)[i, j1,...,jd] = tensor[i1,...,iK,j1,...,jd]`
where `(i1,...,iK)` is the ith `True` entry of `mask` (row-major order).
+ The `axis` argument can be used with `mask` to indicate the axis to mask
+ from. In that case, `axis + dim(mask) <= dim(tensor)` must hold, and
+ `mask`'s shape must match dimensions `axis` through `axis + dim(mask)` of
+ `tensor`'s shape.
Args:
tensor: N-D tensor.
mask: K-D boolean tensor, K <= N and K must be known statically.
name: A name for this operation (optional).
+ axis: A 0-D int Tensor representing the axis in `tensor` to mask from.
+ By default, axis is 0, which masks from the first dimension. Otherwise
+ `K + axis <= N` must hold.
Returns:
(N-K+1)-dimensional tensor populated by entries in `tensor` corresponding
@@ -1173,10 +1179,10 @@ def boolean_mask(tensor, mask, name="boolean_mask"):
```
"""
- def _apply_mask_1d(reshaped_tensor, mask):
+ def _apply_mask_1d(reshaped_tensor, mask, axis=None):
"""Mask tensor along dimension 0 with a 1-D mask."""
indices = squeeze(where(mask), squeeze_dims=[1])
- return gather(reshaped_tensor, indices)
+ return gather(reshaped_tensor, indices, axis=axis)
with ops.name_scope(name, values=[tensor, mask]):
tensor = ops.convert_to_tensor(tensor, name="tensor")
@@ -1191,19 +1197,22 @@ def boolean_mask(tensor, mask, name="boolean_mask"):
raise ValueError(
"Number of mask dimensions must be specified, even if some dimensions"
" are None. E.g. shape=[None] is ok, but shape=None is not.")
- shape_tensor[:ndims_mask].assert_is_compatible_with(shape_mask)
+ axis = 0 if axis is None else axis
+ shape_tensor[axis:axis+ndims_mask].assert_is_compatible_with(shape_mask)
- leading_size = gen_math_ops._prod(shape(tensor)[:ndims_mask], [0])
+ leading_size = gen_math_ops._prod(shape(tensor)[axis:axis+ndims_mask], [0])
tensor = reshape(tensor,
- concat([[leading_size],
- shape(tensor)[ndims_mask:]], 0))
- first_dim = shape_tensor[:ndims_mask].num_elements()
+ concat([shape(tensor)[:axis],
+ [leading_size],
+ shape(tensor)[axis+ndims_mask:]], 0))
+ first_dim = shape_tensor[axis:axis+ndims_mask].num_elements()
tensor.set_shape(
- tensor_shape.as_shape([first_dim])
- .concatenate(shape_tensor[ndims_mask:]))
+ tensor_shape.as_shape(shape_tensor[:axis])
+ .concatenate([first_dim])
+ .concatenate(shape_tensor[axis+ndims_mask:]))
mask = reshape(mask, [-1])
- return _apply_mask_1d(tensor, mask)
+ return _apply_mask_1d(tensor, mask, axis)
def sparse_mask(a, mask_indices, name=None):
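
A small numeric example of the new axis argument under the semantics documented above (values invented here):

    # tensor shape [2, 3]; a 1-D mask over axis 1 keeps columns 0 and 2.
    t = [[1, 2, 3],
         [4, 5, 6]]
    m = [True, False, True]
    # boolean_mask(t, m)         -> error: axis 0 needs a mask of shape [2]
    # boolean_mask(t, m, axis=1) -> [[1, 3],
    #                                [4, 6]]
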
@@ -1525,7 +1534,8 @@ def zeros_like(tensor, dtype=None, name=None, optimize=True):
Args:
tensor: A `Tensor`.
dtype: A type for the returned `Tensor`. Must be `float32`, `float64`,
- `int8`, `int16`, `int32`, `int64`, `uint8`, `complex64`, or `complex128`.
+ `int8`, `uint8`, `int16`, `uint16`, `int32`, `int64`,
+ `complex64`, `complex128`, or `bool`.
name: A name for the operation (optional).
optimize: if true, attempt to statically determine the shape of 'tensor'
and encode it as a constant.
@@ -1576,8 +1586,8 @@ def ones_like(tensor, dtype=None, name=None, optimize=True):
Args:
tensor: A `Tensor`.
dtype: A type for the returned `Tensor`. Must be `float32`, `float64`,
- `int8`, `int16`, `int32`, `int64`, `uint8`, `complex64`, `complex128` or
- `bool`.
+ `int8`, `uint8`, `int16`, `uint16`, `int32`, `int64`,
+ `complex64`, `complex128`, or `bool`.
name: A name for the operation (optional).
optimize: if true, attempt to statically determine the shape of 'tensor'
and encode it as a constant.
@@ -1653,8 +1663,6 @@ def placeholder(dtype, shape=None, name=None):
print(sess.run(y, feed_dict={x: rand_array})) # Will succeed.
```
- @compatibility{eager} Placeholders are not compatible with eager execution.
-
Args:
dtype: The type of elements in the tensor to be fed.
shape: The shape of the tensor to be fed (optional). If the shape is not
@@ -1664,14 +1672,7 @@ def placeholder(dtype, shape=None, name=None):
Returns:
A `Tensor` that may be used as a handle for feeding a value, but not
evaluated directly.
-
- Raises:
- RuntimeError: if eager execution is enabled
"""
- if context.in_eager_mode():
- raise RuntimeError("tf.placeholder() is not compatible with "
- "eager execution.")
-
return gen_array_ops._placeholder(dtype=dtype, shape=shape, name=name)
@@ -1715,8 +1716,6 @@ def sparse_placeholder(dtype, shape=None, name=None):
print(sess.run(y, feed_dict={x: sp_value})) # Will succeed.
```
- @compatibility{eager} Placeholders are not compatible with eager execution.
-
Args:
dtype: The type of `values` elements in the tensor to be fed.
shape: The shape of the tensor to be fed (optional). If the shape is not
@@ -1726,14 +1725,7 @@ def sparse_placeholder(dtype, shape=None, name=None):
Returns:
A `SparseTensor` that may be used as a handle for feeding a value, but not
evaluated directly.
-
- Raises:
- RuntimeError: if eager execution is enabled
"""
- if context.in_eager_mode():
- raise RuntimeError("tf.placeholder() is not compatible with "
- "eager execution.")
-
shape_name = (name + "/shape") if name is not None else None
shape, rank = _normalize_sparse_shape(shape, shape_name)
if shape is None:
diff --git a/tensorflow/python/ops/check_ops.py b/tensorflow/python/ops/check_ops.py
index 7e509f72c1..ceee009104 100644
--- a/tensorflow/python/ops/check_ops.py
+++ b/tensorflow/python/ops/check_ops.py
@@ -48,7 +48,6 @@ import numpy as np
from tensorflow.python.eager import context
from tensorflow.python.framework import dtypes
-from tensorflow.python.framework import errors
from tensorflow.python.framework import ops
from tensorflow.python.framework import sparse_tensor
from tensorflow.python.framework import tensor_util
@@ -97,11 +96,10 @@ def _maybe_constant_value_string(t):
def _assert_static(condition, data):
- """Raises a InvalidArgumentError with as much information as possible."""
+ """Raises a static ValueError with as much information as possible."""
if not condition:
data_static = [_maybe_constant_value_string(x) for x in data]
- raise errors.InvalidArgumentError(node_def=None, op=None,
- message='\n'.join(data_static))
+ raise ValueError('\n'.join(data_static))
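The replacement above trades `errors.InvalidArgumentError` for a plain `ValueError` when a check fails at graph-construction time. A standalone sketch of the same pattern (demo code, not the TF API):

```python
# Report every statically known value, then fail with a plain ValueError.
def assert_static_demo(condition, data):
    if not condition:
        raise ValueError('\n'.join(str(x) for x in data))

assert_static_demo(1 == 1, ['ok'])                   # passes silently
# assert_static_demo(1 == 2, ['lhs: 1', 'rhs: 2'])   # would raise ValueError
```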
def assert_proper_iterable(values):
@@ -305,60 +303,11 @@ def assert_equal(x, y, data=None, summarize=None, message=None, name=None):
Returns:
Op that raises `InvalidArgumentError` if `x == y` is False.
- @compatibility{eager} returns None
-
- Raises:
- InvalidArgumentError if the check can be performed immediately and
- `x == y` is False. The check can be performed immediately during
- eager execution or if `x` and `y` are statically known.
"""
message = message or ''
with ops.name_scope(name, 'assert_equal', [x, y, data]):
x = ops.convert_to_tensor(x, name='x')
y = ops.convert_to_tensor(y, name='y')
-
- if context.in_eager_mode():
- eq = math_ops.equal(x, y)
- condition = math_ops.reduce_all(eq)
- if not condition:
- # Prepare a message with first elements of x and y
- summary_msg = ''
- if summarize:
- # reshape((-1,)) is the fastest way to get a flat array view.
- x_np = x.numpy().reshape((-1,))
- y_np = y.numpy().reshape((-1,))
- x_sum = min(x_np.size, summarize)
- y_sum = min(y_np.size, summarize)
- summary_msg = ('First %d elements of x:\n%s\n'
- 'First %d elements of y:\n%s\n' %
- (x_sum, x_np[:x_sum],
- y_sum, y_np[:y_sum]))
-
- # Get the values that actually differed and their indices
- mask = math_ops.logical_not(eq)
- indices = array_ops.where(mask)
- indices_np = indices.numpy()
- x_vals = array_ops.boolean_mask(x, mask)
- y_vals = array_ops.boolean_mask(y, mask)
- diff_to_print = 0
- if summarize:
- diff_to_print = min(summarize, indices_np.size)
-
- raise errors.InvalidArgumentError(
- node_def=None, op=None,
- message=('%s\nCondition x == y did not hold.\n'
- 'Indices of first %s different values:\n%s\n'
- 'Corresponding x values:\n%s\n'
- 'Corresponding y values:\n%s\n'
- '%s'
- %
- (message or '',
- diff_to_print, indices_np[:diff_to_print],
- x_vals.numpy().reshape((-1,))[:diff_to_print],
- y_vals.numpy().reshape((-1,))[:diff_to_print],
- summary_msg)))
- return
-
if data is None:
data = [
message,
@@ -407,19 +356,12 @@ def assert_none_equal(
with ops.name_scope(name, 'assert_none_equal', [x, y, data]):
x = ops.convert_to_tensor(x, name='x')
y = ops.convert_to_tensor(y, name='y')
- if context.in_eager_mode():
- x_name = 'x'
- y_name = 'y'
- else:
- x_name = x.name
- y_name = y.name
-
if data is None:
data = [
message,
- 'Condition x != y did not hold for every single element:',
- 'x (%s) = ' % x_name, x,
- 'y (%s) = ' % y_name, y
+ 'Condition x != y did not hold for every single element:',
+ 'x (%s) = ' % x.name, x,
+ 'y (%s) = ' % y.name, y
]
condition = math_ops.reduce_all(math_ops.not_equal(x, y))
return control_flow_ops.Assert(condition, data, summarize=summarize)
@@ -455,18 +397,11 @@ def assert_less(x, y, data=None, summarize=None, message=None, name=None):
with ops.name_scope(name, 'assert_less', [x, y, data]):
x = ops.convert_to_tensor(x, name='x')
y = ops.convert_to_tensor(y, name='y')
- if context.in_eager_mode():
- x_name = 'x'
- y_name = 'y'
- else:
- x_name = x.name
- y_name = y.name
-
if data is None:
data = [
message,
- 'Condition x < y did not hold element-wise:',
- 'x (%s) = ' % x_name, x, 'y (%s) = ' % y_name, y
+ 'Condition x < y did not hold element-wise:',
+ 'x (%s) = ' % x.name, x, 'y (%s) = ' % y.name, y
]
condition = math_ops.reduce_all(math_ops.less(x, y))
return control_flow_ops.Assert(condition, data, summarize=summarize)
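With the eager branches removed, these assert ops are graph-only again: they return an op that must actually execute. A hedged usage sketch (placeholder names and shapes are illustrative):

```python
import tensorflow as tf

x = tf.placeholder(tf.float32, [None])
y = tf.placeholder(tf.float32, [None])
check = tf.assert_less(x, y, message='x must be elementwise less than y')
with tf.control_dependencies([check]):
    diff = tf.subtract(y, x)  # evaluating diff also runs the assertion
```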
diff --git a/tensorflow/python/ops/control_flow_ops.py b/tensorflow/python/ops/control_flow_ops.py
index 8afb079d20..10d8e01304 100644
--- a/tensorflow/python/ops/control_flow_ops.py
+++ b/tensorflow/python/ops/control_flow_ops.py
@@ -60,7 +60,6 @@ from tensorflow.core.protobuf import control_flow_pb2
from tensorflow.python.eager import context
from tensorflow.python.framework import constant_op
from tensorflow.python.framework import dtypes
-from tensorflow.python.framework import errors
from tensorflow.python.framework import ops
from tensorflow.python.framework import sparse_tensor
from tensorflow.python.framework import tensor_shape
@@ -87,29 +86,6 @@ from tensorflow.python.util import tf_should_use
_basetuple = tuple
-def _summarize_eager(tensor, summarize=None):
- """Returns a summarized string representation of eager `tensor`.
-
- Args:
- tensor: EagerTensor to summarize
- summarize: Include these many first elements of `array`
- """
- # reshape((-1,)) is the fastest way to get a flat array view
- if tensor._rank(): # pylint: disable=protected-access
- flat = tensor.numpy().reshape((-1,))
- lst = [str(x) for x in flat[:summarize]]
- if len(lst) < flat.size:
- lst.append("...")
- else:
- # tensor.numpy() returns a scalar for zero dimensional arrays
- if summarize != 0:
- lst = [str(tensor.numpy())]
- else:
- lst = []
-
- return ", ".join(lst)
-
-
# pylint: disable=protected-access
@@ -122,8 +98,7 @@ def Assert(condition, data, summarize=None, name=None):
If `condition` evaluates to false, print the list of tensors in `data`.
`summarize` determines how many entries of the tensors to print.
- NOTE: In graph mode, to ensure that Assert executes, one usually attaches
- a dependency:
+ NOTE: To ensure that Assert executes, one usually attaches a dependency:
```python
# Ensure maximum element of x is smaller or equal to 1
@@ -142,21 +117,7 @@ def Assert(condition, data, summarize=None, name=None):
assert_op: An `Operation` that, when executed, raises a
`tf.errors.InvalidArgumentError` if `condition` is not true.
@compatibility{eager} returns None.
-
- Raises:
- @compatibility{eager} `tf.errors.InvalidArgumentError` if `condition`
- is not true
"""
- if context.in_eager_mode():
- if not condition:
- xs = ops.convert_n_to_tensor(data)
- data_str = [_summarize_eager(x, summarize) for x in xs]
- raise errors.InvalidArgumentError(
- node_def=None, op=None,
- message="Expected '%s' to be true. Summarized data: %s" % (
- condition, "\n".join(data_str)))
- return
-
with ops.name_scope(name, "Assert", [condition, data]) as name:
xs = ops.convert_n_to_tensor(data)
if all([x.dtype in {dtypes.string, dtypes.int32} for x in xs]):
diff --git a/tensorflow/python/ops/ctc_ops.py b/tensorflow/python/ops/ctc_ops.py
index f037767cf4..477c0d1cb4 100644
--- a/tensorflow/python/ops/ctc_ops.py
+++ b/tensorflow/python/ops/ctc_ops.py
@@ -22,8 +22,8 @@ from __future__ import print_function
from tensorflow.python.framework import ops
from tensorflow.python.framework import sparse_tensor
-from tensorflow.python.ops import array_ops
from tensorflow.python.ops import gen_ctc_ops
+from tensorflow.python.ops import array_ops
from tensorflow.python.ops.nn_grad import _BroadcastMul
@@ -38,8 +38,7 @@ def ctc_loss(labels, inputs, sequence_length,
[A. Graves, S. Fernandez, F. Gomez, J. Schmidhuber.
Connectionist Temporal Classification: Labeling Unsegmented Sequence Data
- with Recurrent Neural Networks. ICML 2006, Pittsburgh, USA,
- pp. 369-376.](http://www.cs.toronto.edu/~graves/icml_2006.pdf)
+ with Recurrent Neural Networks. ICML 2006, Pittsburgh, USA, pp. 369-376.](http://www.cs.toronto.edu/~graves/icml_2006.pdf)
Input requirements:
@@ -109,9 +108,9 @@ def ctc_loss(labels, inputs, sequence_length,
See `core/ops/ctc_ops.cc` for more details.
inputs: 3-D `float` `Tensor`.
If time_major == False, this will be a `Tensor` shaped:
- `[batch_size, max_time, num_classes]`.
+ `[batch_size x max_time x num_classes]`.
If time_major == True (default), this will be a `Tensor` shaped:
- `[max_time, batch_size, num_classes]`.
+ `[max_time x batch_size x num_classes]`.
The logits.
sequence_length: 1-D `int32` vector, size `[batch_size]`.
The sequence lengths.
@@ -121,18 +120,15 @@ def ctc_loss(labels, inputs, sequence_length,
ignore_longer_outputs_than_inputs: Boolean. Default: False.
If True, sequences with longer outputs than inputs will be ignored.
time_major: The shape format of the `inputs` Tensors.
- If True, these `Tensors` must be shaped `[max_time, batch_size,
- num_classes]`.
- If False, these `Tensors` must be shaped `[batch_size, max_time,
- num_classes]`.
- Using `time_major = True` (default) is a bit more efficient because it
- avoids transposes at the beginning of the ctc_loss calculation. However,
- most TensorFlow data is batch-major, so by this function also accepts
- inputs in batch-major form.
+ If True, these `Tensors` must be shaped `[max_time, batch_size, num_classes]`.
+ If False, these `Tensors` must be shaped `[batch_size, max_time, num_classes]`.
+ Using `time_major = True` (default) is a bit more efficient because it avoids
+ transposes at the beginning of the ctc_loss calculation. However, most
+ TensorFlow data is batch-major, so this function also accepts inputs
+ in batch-major form.
Returns:
- A 1-D `float` `Tensor`, size `[batch]`, containing the negative log
- probabilities.
+ A 1-D `float` `Tensor`, size `[batch]`, containing the negative log probabilities.
Raises:
TypeError: if labels is not a `SparseTensor`.
@@ -202,7 +198,7 @@ def ctc_greedy_decoder(inputs, sequence_length, merge_repeated=True):
Args:
inputs: 3-D `float` `Tensor` sized
- `[max_time, batch_size, num_classes]`. The logits.
+ `[max_time x batch_size x num_classes]`. The logits.
sequence_length: 1-D `int32` vector containing sequence lengths,
having size `[batch_size]`.
merge_repeated: Boolean. Default: True.
@@ -211,7 +207,7 @@ def ctc_greedy_decoder(inputs, sequence_length, merge_repeated=True):
A tuple `(decoded, neg_sum_logits)` where
decoded: A single-element list. `decoded[0]`
is an `SparseTensor` containing the decoded outputs s.t.:
- `decoded.indices`: Indices matrix `(total_decoded_outputs, 2)`.
+ `decoded.indices`: Indices matrix `(total_decoded_outputs x 2)`.
The rows store: `[batch, time]`.
`decoded.values`: Values vector, size `(total_decoded_outputs)`.
The vector stores the decoded classes.
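A hedged sketch of calling `ctc_loss` with the time-major layout the docstring above recommends; all shapes and names are illustrative:

```python
import tensorflow as tf

max_time, batch_size, num_classes = 50, 8, 30
inputs = tf.placeholder(tf.float32, [max_time, batch_size, num_classes])
labels = tf.sparse_placeholder(tf.int32)          # SparseTensor of class ids
seq_len = tf.placeholder(tf.int32, [batch_size])
loss = tf.nn.ctc_loss(labels, inputs, seq_len, time_major=True)  # [batch_size]
```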
diff --git a/tensorflow/python/ops/distributions/dirichlet.py b/tensorflow/python/ops/distributions/dirichlet.py
index 923696a553..2accedf1b9 100644
--- a/tensorflow/python/ops/distributions/dirichlet.py
+++ b/tensorflow/python/ops/distributions/dirichlet.py
@@ -196,7 +196,7 @@ class Dirichlet(distribution.Distribution):
alpha=self.concentration,
dtype=self.dtype,
seed=seed)
- return gamma_sample / math_ops.reduce_sum(gamma_sample, -1, keep_dims=True)
+ return gamma_sample / math_ops.reduce_sum(gamma_sample, -1, keepdims=True)
@distribution_util.AppendDocstring(_dirichlet_sample_note)
def _log_prob(self, x):
diff --git a/tensorflow/python/ops/distributions/multinomial.py b/tensorflow/python/ops/distributions/multinomial.py
index 00b5697c83..d49fac59ca 100644
--- a/tensorflow/python/ops/distributions/multinomial.py
+++ b/tensorflow/python/ops/distributions/multinomial.py
@@ -26,6 +26,7 @@ from tensorflow.python.ops import control_flow_ops
from tensorflow.python.ops import math_ops
from tensorflow.python.ops import nn_ops
from tensorflow.python.ops import random_ops
+from tensorflow.python.ops import functional_ops
from tensorflow.python.ops.distributions import distribution
from tensorflow.python.ops.distributions import util as distribution_util
@@ -140,6 +141,8 @@ class Multinomial(distribution.Distribution):
counts = [[2., 1, 1], [3, 1, 1]]
dist.prob(counts) # Shape [2]
+
+ dist.sample(5) # Shape [5, 2, 3]
```
"""
@@ -231,29 +234,35 @@ class Multinomial(distribution.Distribution):
def _sample_n(self, n, seed=None):
n_draws = math_ops.cast(self.total_count, dtype=dtypes.int32)
- if self.total_count.get_shape().ndims is not None:
- if self.total_count.get_shape().ndims != 0:
- raise NotImplementedError(
- "Sample only supported for scalar number of draws.")
- elif self.validate_args:
- is_scalar = check_ops.assert_rank(
- n_draws, 0,
- message="Sample only supported for scalar number of draws.")
- n_draws = control_flow_ops.with_dependencies([is_scalar], n_draws)
k = self.event_shape_tensor()[0]
- # Flatten batch dims so logits has shape [B, k],
- # where B = reduce_prod(self.batch_shape_tensor()).
- x = random_ops.multinomial(
- logits=array_ops.reshape(self.logits, [-1, k]),
- num_samples=n * n_draws,
- seed=seed)
- x = array_ops.reshape(x, shape=[-1, n, n_draws])
- x = math_ops.reduce_sum(array_ops.one_hot(x, depth=k),
- axis=-2) # shape: [B, n, k]
+
+ # broadcast total_count and logits to the same batch shape
+ n_draws = array_ops.ones_like(
+ self.logits[..., 0], dtype=n_draws.dtype) * n_draws
+ logits = array_ops.ones_like(
+ n_draws[..., array_ops.newaxis], dtype=self.logits.dtype) * self.logits
+
+ # flatten the total_count and logits
+ flat_logits = array_ops.reshape(logits, [-1, k]) # [B1B2...Bm, k]
+ flat_ndraws = n * array_ops.reshape(n_draws, [-1]) # [B1B2...Bm]
+
+ # compute a sample for each (logits, total_count) pair via map_fn
+ def _sample_single(args):
+ logits, n_draw = args[0], args[1] # [K], []
+ x = random_ops.multinomial(logits[array_ops.newaxis, ...],
+ n_draw, seed) # [1, n*n_draw]
+ x = array_ops.reshape(x, shape=[n, -1]) # [n, n_draw]
+ x = math_ops.reduce_sum(array_ops.one_hot(x, depth=k), axis=-2) # [n, k]
+ return x
+ x = functional_ops.map_fn(_sample_single,
+ [flat_logits, flat_ndraws],
+ dtype=self.dtype) # [B1B2...Bm, n, k]
+
+ # reshape the results back to the proper batch shape
x = array_ops.transpose(x, perm=[1, 0, 2])
final_shape = array_ops.concat([[n], self.batch_shape_tensor(), [k]], 0)
- x = array_ops.reshape(x, final_shape)
- return math_ops.cast(x, self.dtype)
+ x = array_ops.reshape(x, final_shape) # [n, B1, B2,..., Bm, k]
+ return x
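The map_fn rewrite above lifts the old scalar-only restriction on `total_count`. A hedged sketch of what that permits (the distributions import path varies across TF releases, e.g. `tf.contrib.distributions`):

```python
import tensorflow as tf

dist = tf.distributions.Multinomial(
    total_count=[5., 10.],                 # one draw count per batch member
    probs=[[0.2, 0.8], [0.5, 0.5]])
samples = dist.sample(7)                   # shape [7, 2, 2]
```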
@distribution_util.AppendDocstring(_multinomial_sample_note)
def _log_prob(self, counts):
diff --git a/tensorflow/python/ops/embedding_ops.py b/tensorflow/python/ops/embedding_ops.py
index f4561d1a83..8c1ccc6840 100644
--- a/tensorflow/python/ops/embedding_ops.py
+++ b/tensorflow/python/ops/embedding_ops.py
@@ -191,9 +191,12 @@ def _embedding_lookup_and_transform(params,
(flat_ids - extras) // ids_per_partition)
# Emulate a conditional using a boolean indicator tensor
- new_ids = array_ops.where(p_assignments < extras,
- flat_ids % (ids_per_partition + 1),
- (flat_ids - extras) % ids_per_partition)
+ is_in_first_extras_partitions = math_ops.cast(p_assignments < extras,
+ flat_ids.dtype)
+ new_ids = (is_in_first_extras_partitions * (flat_ids %
+ (ids_per_partition + 1)) +
+ (1 - is_in_first_extras_partitions) *
+ ((flat_ids - extras) % ids_per_partition))
else:
raise ValueError("Unrecognized partition strategy: " +
partition_strategy)
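The replacement above emulates `where(c, a, b)` with indicator arithmetic, `c*a + (1-c)*b`. A standalone illustration of the identity (values are arbitrary):

```python
import tensorflow as tf

cond = tf.constant([True, False, True])
a = tf.constant([10, 20, 30])
b = tf.constant([1, 2, 3])
ind = tf.cast(cond, a.dtype)
selected = ind * a + (1 - ind) * b  # expected [10, 2, 30], same as tf.where
```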
diff --git a/tensorflow/python/ops/image_ops_impl.py b/tensorflow/python/ops/image_ops_impl.py
index 2946dbe81e..7c23321ca5 100644
--- a/tensorflow/python/ops/image_ops_impl.py
+++ b/tensorflow/python/ops/image_ops_impl.py
@@ -1121,7 +1121,7 @@ def rgb_to_grayscale(images, name=None):
rank_1 = array_ops.expand_dims(array_ops.rank(images) - 1, 0)
gray_float = math_ops.reduce_sum(flt_image * rgb_weights,
rank_1,
- keep_dims=True)
+ keepdims=True)
gray_float.set_shape(images.get_shape()[:-1].concatenate([1]))
return convert_image_dtype(gray_float, orig_dtype, name=name)
@@ -1212,26 +1212,7 @@ def adjust_hue(image, delta, name=None):
orig_dtype = image.dtype
flt_image = convert_image_dtype(image, dtypes.float32)
- # TODO(zhengxq): we will switch to the fused version after we add a GPU
- # kernel for that.
- fused = os.environ.get('TF_ADJUST_HUE_FUSED', '')
- fused = fused.lower() in ('true', 't', '1')
-
- if not fused:
- hsv = gen_image_ops.rgb_to_hsv(flt_image)
-
- hue = array_ops.slice(hsv, [0, 0, 0], [-1, -1, 1])
- saturation = array_ops.slice(hsv, [0, 0, 1], [-1, -1, 1])
- value = array_ops.slice(hsv, [0, 0, 2], [-1, -1, 1])
-
- # Note that we add 2*pi to guarantee that the resulting hue is a positive
- # floating point number since delta is [-0.5, 0.5].
- hue = math_ops.mod(hue + (delta + 1.), 1.)
-
- hsv_altered = array_ops.concat([hue, saturation, value], 2)
- rgb_altered = gen_image_ops.hsv_to_rgb(hsv_altered)
- else:
- rgb_altered = gen_image_ops.adjust_hue(flt_image, delta)
+ rgb_altered = gen_image_ops.adjust_hue(flt_image, delta)
return convert_image_dtype(rgb_altered, orig_dtype)
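Callers are unaffected by dropping the unfused fallback; `adjust_hue` keeps its contract of shifting the hue channel by `delta` in [-0.5, 0.5]. An illustrative call:

```python
import tensorflow as tf

image = tf.placeholder(tf.float32, [None, None, 3])  # RGB, channels-last
shifted = tf.image.adjust_hue(image, delta=0.1)
```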
diff --git a/tensorflow/python/ops/linalg_ops.py b/tensorflow/python/ops/linalg_ops.py
index 2cb467c891..14a039ffd0 100644
--- a/tensorflow/python/ops/linalg_ops.py
+++ b/tensorflow/python/ops/linalg_ops.py
@@ -30,6 +30,7 @@ from tensorflow.python.ops import math_ops
from tensorflow.python.ops.gen_linalg_ops import *
# pylint: enable=wildcard-import
from tensorflow.python.util import compat
+from tensorflow.python.util.deprecation import deprecated_args
# Names below are lower_case.
# pylint: disable=invalid-name
@@ -438,7 +439,10 @@ def svd(tensor, full_matrices=False, compute_uv=True, name=None):
# pylint: disable=redefined-builtin
-def norm(tensor, ord='euclidean', axis=None, keep_dims=False, name=None):
+@deprecated_args(None, "keep_dims is deprecated, use keepdims instead",
+ "keep_dims")
+def norm(tensor, ord='euclidean', axis=None, keepdims=None, name=None,
+ keep_dims=None):
r"""Computes the norm of vectors, matrices, and tensors.
This function can compute several different vector norms (the 1-norm, the
@@ -471,13 +475,13 @@ def norm(tensor, ord='euclidean', axis=None, keep_dims=False, name=None):
can be either a matrix or a batch of matrices at runtime, pass
`axis=[-2,-1]` instead of `axis=None` to make sure that matrix norms are
computed.
- keep_dims: If True, the axis indicated in `axis` are kept with size 1.
+ keepdims: If True, the axes indicated in `axis` are kept with size 1.
Otherwise, the dimensions in `axis` are removed from the output shape.
name: The name of the op.
Returns:
output: A `Tensor` of the same type as tensor, containing the vector or
- matrix norms. If `keep_dims` is True then the rank of output is equal to
+ matrix norms. If `keepdims` is True then the rank of output is equal to
the rank of `tensor`. Otherwise, if `axis` is `None` the output is a scalar,
if `axis` is an integer, the rank of `output` is one less than the rank
of `tensor`, if `axis` is a 2-tuple the rank of `output` is two less
@@ -497,6 +501,13 @@ def norm(tensor, ord='euclidean', axis=None, keep_dims=False, name=None):
@end_compatibility
"""
+ if keep_dims is not None:
+ if keepdims is not None:
+ raise ValueError("Cannot specify both 'keep_dims' and 'keepdims'")
+ keepdims = keep_dims
+ if keepdims is None:
+ keepdims = False
+
is_matrix_norm = ((isinstance(axis, tuple) or isinstance(axis, list)) and
len(axis) == 2)
if is_matrix_norm:
@@ -528,25 +539,25 @@ def norm(tensor, ord='euclidean', axis=None, keep_dims=False, name=None):
# matrices.
result = math_ops.sqrt(
math_ops.reduce_sum(
- tensor * math_ops.conj(tensor), axis, keep_dims=True))
+ tensor * math_ops.conj(tensor), axis, keepdims=True))
else:
result = math_ops.abs(tensor)
if ord == 1:
sum_axis = None if axis is None else axis[0]
- result = math_ops.reduce_sum(result, sum_axis, keep_dims=True)
+ result = math_ops.reduce_sum(result, sum_axis, keepdims=True)
if is_matrix_norm:
- result = math_ops.reduce_max(result, axis[-1], keep_dims=True)
+ result = math_ops.reduce_max(result, axis[-1], keepdims=True)
elif ord == np.inf:
if is_matrix_norm:
- result = math_ops.reduce_sum(result, axis[1], keep_dims=True)
+ result = math_ops.reduce_sum(result, axis[1], keepdims=True)
max_axis = None if axis is None else axis[0]
- result = math_ops.reduce_max(result, max_axis, keep_dims=True)
+ result = math_ops.reduce_max(result, max_axis, keepdims=True)
else:
# General p-norms (positive p only)
result = math_ops.pow(
math_ops.reduce_sum(
- math_ops.pow(result, ord), axis, keep_dims=True), 1.0 / ord)
- if not keep_dims:
+ math_ops.pow(result, ord), axis, keepdims=True), 1.0 / ord)
+ if not keepdims:
result = array_ops.squeeze(result, axis)
return result
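The same `keep_dims`/`keepdims` resolution block recurs in `norm` above and in every `reduce_*` op below. A standalone sketch of the shim (demo only, not TF API):

```python
def resolve_keepdims(keepdims=None, keep_dims=None):
    # Honor the deprecated spelling, reject contradictory callers, and fall
    # back to the documented default of False.
    if keep_dims is not None:
        if keepdims is not None:
            raise ValueError("Cannot specify both 'keep_dims' and 'keepdims'")
        keepdims = keep_dims
    return False if keepdims is None else keepdims

assert resolve_keepdims() is False
assert resolve_keepdims(keep_dims=True) is True
```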
diff --git a/tensorflow/python/ops/math_grad_test.py b/tensorflow/python/ops/math_grad_test.py
index 5732c756ce..04eeb00518 100644
--- a/tensorflow/python/ops/math_grad_test.py
+++ b/tensorflow/python/ops/math_grad_test.py
@@ -113,6 +113,23 @@ class MinOrMaxGradientTest(test.TestCase):
self.assertLess(error, 1e-4)
+class MaximumOrMinimumGradientTest(test.TestCase):
+
+ def testMaximumGradient(self):
+ inputs = constant_op.constant([1.0, 2.0, 3.0, 4.0], dtype=dtypes.float32)
+ outputs = math_ops.maximum(inputs, 3.0)
+ with self.test_session():
+ error = gradient_checker.compute_gradient_error(inputs, [4], outputs, [4])
+ self.assertLess(error, 1e-4)
+
+ def testMinimumGradient(self):
+ inputs = constant_op.constant([1.0, 2.0, 3.0, 4.0], dtype=dtypes.float32)
+ outputs = math_ops.minimum(inputs, 2.0)
+ with self.test_session():
+ error = gradient_checker.compute_gradient_error(inputs, [4], outputs, [4])
+ self.assertLess(error, 1e-4)
+
+
class ProdGradientTest(test.TestCase):
def testProdGradient(self):
diff --git a/tensorflow/python/ops/math_ops.py b/tensorflow/python/ops/math_ops.py
index 886b2048f9..81b3c21808 100644
--- a/tensorflow/python/ops/math_ops.py
+++ b/tensorflow/python/ops/math_ops.py
@@ -1265,16 +1265,19 @@ def _ReductionDims(x, axis, reduction_indices):
return range(0, array_ops.rank(x))
+@deprecated_args(None, "keep_dims is deprecated, use keepdims instead",
+ "keep_dims")
def reduce_sum(input_tensor,
axis=None,
- keep_dims=False,
+ keepdims=None,
name=None,
- reduction_indices=None):
+ reduction_indices=None,
+ keep_dims=None):
"""Computes the sum of elements across dimensions of a tensor.
Reduces `input_tensor` along the dimensions given in `axis`.
- Unless `keep_dims` is true, the rank of the tensor is reduced by 1 for each
- entry in `axis`. If `keep_dims` is true, the reduced dimensions
+ Unless `keepdims` is true, the rank of the tensor is reduced by 1 for each
+ entry in `axis`. If `keepdims` is true, the reduced dimensions
are retained with length 1.
If `axis` has no entries, all dimensions are reduced, and a
@@ -1287,7 +1290,7 @@ def reduce_sum(input_tensor,
tf.reduce_sum(x) # 6
tf.reduce_sum(x, 0) # [2, 2, 2]
tf.reduce_sum(x, 1) # [3, 3]
- tf.reduce_sum(x, 1, keep_dims=True) # [[3], [3]]
+ tf.reduce_sum(x, 1, keepdims=True) # [[3], [3]]
tf.reduce_sum(x, [0, 1]) # 6
```
@@ -1296,7 +1299,7 @@ def reduce_sum(input_tensor,
axis: The dimensions to reduce. If `None` (the default),
reduces all dimensions. Must be in the range
`[-rank(input_tensor), rank(input_tensor))`.
- keep_dims: If true, retains reduced dimensions with length 1.
+ keepdims: If true, retains reduced dimensions with length 1.
name: A name for the operation (optional).
reduction_indices: The old (deprecated) name for axis.
@@ -1307,24 +1310,35 @@ def reduce_sum(input_tensor,
Equivalent to np.sum
@end_compatibility
"""
+
+ if keep_dims is not None:
+ if keepdims is not None:
+ raise ValueError("Cannot specify both 'keep_dims' and 'keepdims'")
+ keepdims = keep_dims
+ if keepdims is None:
+ keepdims = False
+
return gen_math_ops._sum(
input_tensor,
_ReductionDims(input_tensor, axis, reduction_indices),
- keep_dims,
+ keepdims,
name=name)
+@deprecated_args(None, "keep_dims is deprecated, use keepdims instead",
+ "keep_dims")
def count_nonzero(input_tensor,
axis=None,
- keep_dims=False,
+ keepdims=None,
dtype=dtypes.int64,
name=None,
- reduction_indices=None):
+ reduction_indices=None,
+ keep_dims=None):
"""Computes number of nonzero elements across dimensions of a tensor.
Reduces `input_tensor` along the dimensions given in `axis`.
- Unless `keep_dims` is true, the rank of the tensor is reduced by 1 for each
- entry in `axis`. If `keep_dims` is true, the reduced dimensions
+ Unless `keepdims` is true, the rank of the tensor is reduced by 1 for each
+ entry in `axis`. If `keepdims` is true, the reduced dimensions
are retained with length 1.
If `axis` has no entries, all dimensions are reduced, and a
@@ -1341,7 +1355,7 @@ def count_nonzero(input_tensor,
tf.count_nonzero(x) # 3
tf.count_nonzero(x, 0) # [1, 2, 0]
tf.count_nonzero(x, 1) # [1, 2]
- tf.count_nonzero(x, 1, keep_dims=True) # [[1], [2]]
+ tf.count_nonzero(x, 1, keepdims=True) # [[1], [2]]
tf.count_nonzero(x, [0, 1]) # 3
```
@@ -1350,7 +1364,7 @@ def count_nonzero(input_tensor,
axis: The dimensions to reduce. If `None` (the default),
reduces all dimensions. Must be in the range
`[-rank(input_tensor), rank(input_tensor))`.
- keep_dims: If true, retains reduced dimensions with length 1.
+ keepdims: If true, retains reduced dimensions with length 1.
dtype: The output dtype; defaults to `tf.int64`.
name: A name for the operation (optional).
reduction_indices: The old (deprecated) name for axis.
@@ -1358,6 +1372,13 @@ def count_nonzero(input_tensor,
Returns:
The reduced tensor (number of nonzero values).
"""
+ if keep_dims is not None:
+ if keepdims is not None:
+ raise ValueError("Cannot specify both 'keep_dims' and 'keepdims'")
+ keepdims = keep_dims
+ if keepdims is None:
+ keepdims = False
+
with ops.name_scope(name, "count_nonzero", [input_tensor]):
input_tensor = ops.convert_to_tensor(input_tensor, name="input_tensor")
zero = input_tensor.dtype.as_numpy_dtype()
@@ -1366,21 +1387,24 @@ def count_nonzero(input_tensor,
# int64 reduction happens on GPU
to_int64(gen_math_ops.not_equal(input_tensor, zero)),
axis=axis,
- keep_dims=keep_dims,
+ keepdims=keepdims,
reduction_indices=reduction_indices),
dtype=dtype)
+@deprecated_args(None, "keep_dims is deprecated, use keepdims instead",
+ "keep_dims")
def reduce_mean(input_tensor,
axis=None,
- keep_dims=False,
+ keepdims=None,
name=None,
- reduction_indices=None):
+ reduction_indices=None,
+ keep_dims=None):
"""Computes the mean of elements across dimensions of a tensor.
Reduces `input_tensor` along the dimensions given in `axis`.
- Unless `keep_dims` is true, the rank of the tensor is reduced by 1 for each
- entry in `axis`. If `keep_dims` is true, the reduced dimensions
+ Unless `keepdims` is true, the rank of the tensor is reduced by 1 for each
+ entry in `axis`. If `keepdims` is true, the reduced dimensions
are retained with length 1.
If `axis` has no entries, all dimensions are reduced, and a
@@ -1400,7 +1424,7 @@ def reduce_mean(input_tensor,
axis: The dimensions to reduce. If `None` (the default),
reduces all dimensions. Must be in the range
`[-rank(input_tensor), rank(input_tensor))`.
- keep_dims: If true, retains reduced dimensions with length 1.
+ keepdims: If true, retains reduced dimensions with length 1.
name: A name for the operation (optional).
reduction_indices: The old (deprecated) name for axis.
@@ -1409,25 +1433,44 @@ def reduce_mean(input_tensor,
@compatibility(numpy)
Equivalent to np.mean
+
+ Please note that `np.mean` has a `dtype` parameter that could be used to
+ specify the output type. By default this is `dtype=float64`. On the other
+ hand, `tf.reduce_mean` has an aggressive type inference from
+ `input_tensor`, for example:
+
+ ```python
+ x = tf.constant([1, 0, 1, 0])
+ tf.reduce_mean(x) # 0
+ y = tf.constant([1., 0., 1., 0.])
+ tf.reduce_mean(y) # 0.5
+ ```
+
@end_compatibility
"""
+ if keep_dims is not None:
+ if keepdims is not None:
+ raise ValueError("Cannot specify both 'keep_dims' and 'keepdims'")
+ keepdims = keep_dims
+ if keepdims is None:
+ keepdims = False
return gen_math_ops._mean(
input_tensor,
_ReductionDims(input_tensor, axis, reduction_indices),
- keep_dims,
+ keepdims,
name=name)
+@deprecated_args(None, "keep_dims is deprecated, use keepdims instead",
+ "keep_dims")
def reduce_prod(input_tensor,
axis=None,
- keep_dims=False,
+ keepdims=None,
name=None,
- reduction_indices=None):
+ reduction_indices=None,
+ keep_dims=None):
"""Computes the product of elements across dimensions of a tensor.
Reduces `input_tensor` along the dimensions given in `axis`.
- Unless `keep_dims` is true, the rank of the tensor is reduced by 1 for each
- entry in `axis`. If `keep_dims` is true, the reduced dimensions
+ Unless `keepdims` is true, the rank of the tensor is reduced by 1 for each
+ entry in `axis`. If `keepdims` is true, the reduced dimensions
are retained with length 1.
If `axis` has no entries, all dimensions are reduced, and a
@@ -1438,7 +1481,7 @@ def reduce_prod(input_tensor,
axis: The dimensions to reduce. If `None` (the default),
reduces all dimensions. Must be in the range
`[-rank(input_tensor), rank(input_tensor))`.
- keep_dims: If true, retains reduced dimensions with length 1.
+ keepdims: If true, retains reduced dimensions with length 1.
name: A name for the operation (optional).
reduction_indices: The old (deprecated) name for axis.
@@ -1449,23 +1492,32 @@ def reduce_prod(input_tensor,
Equivalent to np.prod
@end_compatibility
"""
+ if keep_dims is not None:
+ if keepdims is not None:
+ raise ValueError("Cannot specify both 'keep_dims' and 'keepdims'")
+ keepdims = keep_dims
+ if keepdims is None:
+ keepdims = False
return gen_math_ops._prod(
input_tensor,
_ReductionDims(input_tensor, axis, reduction_indices),
- keep_dims,
+ keepdims,
name=name)
+@deprecated_args(None, "keep_dims is deprecated, use keepdims instead",
+ "keep_dims")
def reduce_min(input_tensor,
axis=None,
- keep_dims=False,
+ keepdims=None,
name=None,
- reduction_indices=None):
+ reduction_indices=None,
+ keep_dims=None):
"""Computes the minimum of elements across dimensions of a tensor.
Reduces `input_tensor` along the dimensions given in `axis`.
- Unless `keep_dims` is true, the rank of the tensor is reduced by 1 for each
- entry in `axis`. If `keep_dims` is true, the reduced dimensions
+ Unless `keepdims` is true, the rank of the tensor is reduced by 1 for each
+ entry in `axis`. If `keepdims` is true, the reduced dimensions
are retained with length 1.
If `axis` has no entries, all dimensions are reduced, and a
@@ -1476,7 +1528,7 @@ def reduce_min(input_tensor,
axis: The dimensions to reduce. If `None` (the default),
reduces all dimensions. Must be in the range
`[-rank(input_tensor), rank(input_tensor))`.
- keep_dims: If true, retains reduced dimensions with length 1.
+ keepdims: If true, retains reduced dimensions with length 1.
name: A name for the operation (optional).
reduction_indices: The old (deprecated) name for axis.
@@ -1487,23 +1539,32 @@ def reduce_min(input_tensor,
Equivalent to np.min
@end_compatibility
"""
+ if keep_dims is not None:
+ if keepdims is not None:
+ raise ValueError("Cannot specify both 'keep_dims' and 'keepdims'")
+ keepdims = keep_dims
+ if keepdims is None:
+ keepdims = False
return gen_math_ops._min(
input_tensor,
_ReductionDims(input_tensor, axis, reduction_indices),
- keep_dims,
+ keepdims,
name=name)
+@deprecated_args(None, "keep_dims is deprecated, use keepdims instead",
+ "keep_dims")
def reduce_max(input_tensor,
axis=None,
- keep_dims=False,
+ keepdims=None,
name=None,
- reduction_indices=None):
+ reduction_indices=None,
+ keep_dims=None):
"""Computes the maximum of elements across dimensions of a tensor.
Reduces `input_tensor` along the dimensions given in `axis`.
- Unless `keep_dims` is true, the rank of the tensor is reduced by 1 for each
- entry in `axis`. If `keep_dims` is true, the reduced dimensions
+ Unless `keepdims` is true, the rank of the tensor is reduced by 1 for each
+ entry in `axis`. If `keepdims` is true, the reduced dimensions
are retained with length 1.
If `axis` has no entries, all dimensions are reduced, and a
@@ -1514,7 +1575,7 @@ def reduce_max(input_tensor,
axis: The dimensions to reduce. If `None` (the default),
reduces all dimensions. Must be in the range
`[-rank(input_tensor), rank(input_tensor))`.
- keep_dims: If true, retains reduced dimensions with length 1.
+ keepdims: If true, retains reduced dimensions with length 1.
name: A name for the operation (optional).
reduction_indices: The old (deprecated) name for axis.
@@ -1525,23 +1586,32 @@ def reduce_max(input_tensor,
Equivalent to np.max
@end_compatibility
"""
+ if keep_dims is not None:
+ if keepdims is not None:
+ raise ValueError("Cannot specify both 'keep_dims' and 'keepdims'")
+ keepdims = keep_dims
+ if keepdims is None:
+ keepdims = False
return gen_math_ops._max(
input_tensor,
_ReductionDims(input_tensor, axis, reduction_indices),
- keep_dims,
+ keepdims,
name=name)
+@deprecated_args(None, "keep_dims is deprecated, use keepdims instead",
+ "keep_dims")
def reduce_all(input_tensor,
axis=None,
- keep_dims=False,
+ keepdims=None,
name=None,
- reduction_indices=None):
+ reduction_indices=None,
+ keep_dims=None):
"""Computes the "logical and" of elements across dimensions of a tensor.
Reduces `input_tensor` along the dimensions given in `axis`.
- Unless `keep_dims` is true, the rank of the tensor is reduced by 1 for each
- entry in `axis`. If `keep_dims` is true, the reduced dimensions
+ Unless `keepdims` is true, the rank of the tensor is reduced by 1 for each
+ entry in `axis`. If `keepdims` is true, the reduced dimensions
are retained with length 1.
If `axis` has no entries, all dimensions are reduced, and a
@@ -1561,7 +1631,7 @@ def reduce_all(input_tensor,
axis: The dimensions to reduce. If `None` (the default),
reduces all dimensions. Must be in the range
`[-rank(input_tensor), rank(input_tensor))`.
- keep_dims: If true, retains reduced dimensions with length 1.
+ keepdims: If true, retains reduced dimensions with length 1.
name: A name for the operation (optional).
reduction_indices: The old (deprecated) name for axis.
@@ -1572,23 +1642,32 @@ def reduce_all(input_tensor,
Equivalent to np.all
@end_compatibility
"""
+ if keep_dims is not None:
+ if keepdims is not None:
+ raise ValueError("Cannot specify both 'keep_dims' and 'keepdims'")
+ keepdims = keep_dims
+ if keepdims is None:
+ keepdims = False
return gen_math_ops._all(
input_tensor,
_ReductionDims(input_tensor, axis, reduction_indices),
- keep_dims,
+ keepdims,
name=name)
+@deprecated_args(None, "keep_dims is deprecated, use keepdims instead",
+ "keep_dims")
def reduce_any(input_tensor,
axis=None,
- keep_dims=False,
+ keepdims=None,
name=None,
- reduction_indices=None):
+ reduction_indices=None,
+ keep_dims=None):
"""Computes the "logical or" of elements across dimensions of a tensor.
Reduces `input_tensor` along the dimensions given in `axis`.
- Unless `keep_dims` is true, the rank of the tensor is reduced by 1 for each
- entry in `axis`. If `keep_dims` is true, the reduced dimensions
+ Unless `keepdims` is true, the rank of the tensor is reduced by 1 for each
+ entry in `axis`. If `keepdims` is true, the reduced dimensions
are retained with length 1.
If `axis` has no entries, all dimensions are reduced, and a
@@ -1608,7 +1687,7 @@ def reduce_any(input_tensor,
axis: The dimensions to reduce. If `None` (the default),
reduces all dimensions. Must be in the range
`[-rank(input_tensor), rank(input_tensor))`.
- keep_dims: If true, retains reduced dimensions with length 1.
+ keepdims: If true, retains reduced dimensions with length 1.
name: A name for the operation (optional).
reduction_indices: The old (deprecated) name for axis.
@@ -1619,23 +1698,32 @@ def reduce_any(input_tensor,
Equivalent to np.any
@end_compatibility
"""
+ if keep_dims is not None:
+ if keepdims is not None:
+ raise ValueError("Cannot specify both 'keep_dims' and 'keepdims'")
+ keepdims = keep_dims
+ if keepdims is None:
+ keepdims = False
return gen_math_ops._any(
input_tensor,
_ReductionDims(input_tensor, axis, reduction_indices),
- keep_dims,
+ keepdims,
name=name)
+@deprecated_args(None, "keep_dims is deprecated, use keepdims instead",
+ "keep_dims")
def reduce_logsumexp(input_tensor,
axis=None,
- keep_dims=False,
+ keepdims=None,
name=None,
- reduction_indices=None):
+ reduction_indices=None,
+ keep_dims=None):
"""Computes log(sum(exp(elements across dimensions of a tensor))).
Reduces `input_tensor` along the dimensions given in `axis`.
- Unless `keep_dims` is true, the rank of the tensor is reduced by 1 for each
- entry in `axis`. If `keep_dims` is true, the reduced dimensions
+ Unless `keepdims` is true, the rank of the tensor is reduced by 1 for each
+ entry in `axis`. If `keepdims` is true, the reduced dimensions
are retained with length 1.
If `axis` has no entries, all dimensions are reduced, and a
@@ -1652,7 +1740,7 @@ def reduce_logsumexp(input_tensor,
tf.reduce_logsumexp(x) # log(6)
tf.reduce_logsumexp(x, 0) # [log(2), log(2), log(2)]
tf.reduce_logsumexp(x, 1) # [log(3), log(3)]
- tf.reduce_logsumexp(x, 1, keep_dims=True) # [[log(3)], [log(3)]]
+ tf.reduce_logsumexp(x, 1, keepdims=True) # [[log(3)], [log(3)]]
tf.reduce_logsumexp(x, [0, 1]) # log(6)
```
@@ -1661,19 +1749,25 @@ def reduce_logsumexp(input_tensor,
axis: The dimensions to reduce. If `None` (the default),
reduces all dimensions. Must be in the range
`[-rank(input_tensor), rank(input_tensor))`.
- keep_dims: If true, retains reduced dimensions with length 1.
+ keepdims: If true, retains reduced dimensions with length 1.
name: A name for the operation (optional).
reduction_indices: The old (deprecated) name for axis.
Returns:
The reduced tensor.
"""
+ if keep_dims is not None:
+ if keepdims is not None:
+ raise ValueError("Cannot specify both 'keep_dims' and 'keepdims'")
+ keepdims = keep_dims
+ if keepdims is None:
+ keepdims = False
with ops.name_scope(name, "ReduceLogSumExp", [input_tensor]) as name:
raw_max = reduce_max(
input_tensor,
axis=axis,
reduction_indices=reduction_indices,
- keep_dims=True)
+ keepdims=True)
my_max = array_ops.stop_gradient(
array_ops.where(
gen_math_ops.is_finite(raw_max),
@@ -1683,9 +1777,9 @@ def reduce_logsumexp(input_tensor,
reduce_sum(
gen_math_ops.exp(input_tensor - my_max),
axis,
- keep_dims=True,
+ keepdims=True,
reduction_indices=reduction_indices)) + my_max
- if not keep_dims:
+ if not keepdims:
if isinstance(axis, int):
axis = [axis]
result = array_ops.squeeze(result, axis)
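Why `reduce_logsumexp` subtracts the (gradient-stopped) max before exponentiating: the naive formula overflows for large inputs, while the shifted form is exact. A pure-NumPy illustration:

```python
import numpy as np

x = np.array([1000.0, 1000.0])
naive = np.log(np.sum(np.exp(x)))            # inf: exp(1000) overflows
m = np.max(x)
stable = np.log(np.sum(np.exp(x - m))) + m   # 1000.6931..., i.e. 1000 + log 2
```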
@@ -2191,8 +2285,10 @@ def bincount(arr,
maxlength = ops.convert_to_tensor(
maxlength, name="maxlength", dtype=dtypes.int32)
output_size = gen_math_ops.minimum(maxlength, output_size)
- weights = (ops.convert_to_tensor(weights, name="weights")
- if weights is not None else constant_op.constant([], dtype))
+ if weights is not None:
+ weights = ops.convert_to_tensor(weights, name="weights")
+ return gen_math_ops.unsorted_segment_sum(weights, arr, output_size)
+ weights = constant_op.constant([], dtype)
return gen_math_ops.bincount(arr, output_size, weights)
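The weighted branch above routes through `unsorted_segment_sum`, which is exactly "sum the weights that fall into each bucket". A NumPy rendering of the same semantics (values are arbitrary):

```python
import numpy as np

arr = np.array([0, 1, 1, 2])
weights = np.array([0.5, 1.0, 2.0, 3.0])
out = np.zeros(3)
np.add.at(out, arr, weights)  # [0.5, 3.0, 3.0]: weights summed per value
```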
@@ -2355,7 +2451,7 @@ def reduced_shape(input_shape, axes):
input_shape: 1-D Tensor, the shape of the Tensor being reduced.
axes: 1-D Tensor, the reduction axes.
Returns:
- A 1-D Tensor, the output shape as if keep_dims were set to True.
+ A 1-D Tensor, the output shape as if keepdims were set to True.
"""
# Example:
# cast needed for SparseTensor reductions
diff --git a/tensorflow/python/ops/metrics_impl.py b/tensorflow/python/ops/metrics_impl.py
index 67caf72621..870c4f4062 100644
--- a/tensorflow/python/ops/metrics_impl.py
+++ b/tensorflow/python/ops/metrics_impl.py
@@ -794,7 +794,7 @@ def mean_cosine_distance(labels, predictions, dim, weights=None,
radial_diffs = math_ops.multiply(predictions, labels)
radial_diffs = math_ops.reduce_sum(radial_diffs,
reduction_indices=[dim,],
- keep_dims=True)
+ keepdims=True)
mean_distance, update_op = mean(radial_diffs, weights,
None,
None,
diff --git a/tensorflow/python/ops/nn.py b/tensorflow/python/ops/nn.py
index ee1a00623a..79af3ac117 100644
--- a/tensorflow/python/ops/nn.py
+++ b/tensorflow/python/ops/nn.py
@@ -74,7 +74,6 @@ See the @{$python/nn} guide.
@@softmax
@@log_softmax
@@softmax_cross_entropy_with_logits
-@@softmax_cross_entropy_with_logits_v2
@@sparse_softmax_cross_entropy_with_logits
@@weighted_cross_entropy_with_logits
@@embedding_lookup
diff --git a/tensorflow/python/ops/nn_fused_batchnorm_test.py b/tensorflow/python/ops/nn_fused_batchnorm_test.py
index 1fcd0384da..e72d34d1f7 100644
--- a/tensorflow/python/ops/nn_fused_batchnorm_test.py
+++ b/tensorflow/python/ops/nn_fused_batchnorm_test.py
@@ -335,22 +335,22 @@ class BatchNormalizationTest(test.TestCase):
def testInference(self):
x_shape = [1, 1, 6, 1]
- if test.is_gpu_available(cuda_only=True):
- for dtype in [np.float16, np.float32]:
+ for dtype in [np.float16, np.float32]:
+ if test.is_gpu_available(cuda_only=True):
self._test_inference(
x_shape, dtype, [1], np.float32, use_gpu=True, data_format='NHWC')
self._test_inference(
x_shape, dtype, [1], np.float32, use_gpu=True, data_format='NCHW')
- self._test_inference(
- x_shape, np.float32, [1], np.float32, use_gpu=False, data_format='NHWC')
+ self._test_inference(
+ x_shape, dtype, [1], np.float32, use_gpu=False, data_format='NHWC')
x_shape = [1, 1, 6, 2]
if test.is_gpu_available(cuda_only=True):
for dtype in [np.float16, np.float32]:
self._test_inference(
x_shape, dtype, [2], np.float32, use_gpu=True, data_format='NHWC')
- self._test_inference(
- x_shape, np.float32, [2], np.float32, use_gpu=False, data_format='NHWC')
+ self._test_inference(
+ x_shape, dtype, [2], np.float32, use_gpu=False, data_format='NHWC')
x_shape = [1, 2, 1, 6]
if test.is_gpu_available(cuda_only=True):
@@ -359,33 +359,33 @@ class BatchNormalizationTest(test.TestCase):
x_shape, dtype, [2], np.float32, use_gpu=True, data_format='NCHW')
x_shape = [27, 131, 127, 6]
- if test.is_gpu_available(cuda_only=True):
- for dtype in [np.float16, np.float32]:
+ for dtype in [np.float16, np.float32]:
+ if test.is_gpu_available(cuda_only=True):
self._test_inference(
x_shape, dtype, [131], np.float32, use_gpu=True, data_format='NCHW')
self._test_inference(
x_shape, dtype, [6], np.float32, use_gpu=True, data_format='NHWC')
- self._test_inference(
- x_shape, np.float32, [6], np.float32, use_gpu=False, data_format='NHWC')
+ self._test_inference(
+ x_shape, dtype, [6], np.float32, use_gpu=False, data_format='NHWC')
def testTraining(self):
x_shape = [1, 1, 6, 1]
- if test.is_gpu_available(cuda_only=True):
- for dtype in [np.float16, np.float32]:
+ for dtype in [np.float16, np.float32]:
+ if test.is_gpu_available(cuda_only=True):
self._test_training(
x_shape, dtype, [1], np.float32, use_gpu=True, data_format='NHWC')
self._test_training(
x_shape, dtype, [1], np.float32, use_gpu=True, data_format='NCHW')
- self._test_training(
- x_shape, np.float32, [1], np.float32, use_gpu=False, data_format='NHWC')
+ self._test_training(
+ x_shape, dtype, [1], np.float32, use_gpu=False, data_format='NHWC')
x_shape = [1, 1, 6, 2]
- if test.is_gpu_available(cuda_only=True):
- for dtype in [np.float16, np.float32]:
+ for dtype in [np.float16, np.float32]:
+ if test.is_gpu_available(cuda_only=True):
self._test_training(
x_shape, dtype, [2], np.float32, use_gpu=True, data_format='NHWC')
- self._test_training(
- x_shape, np.float32, [2], np.float32, use_gpu=False, data_format='NHWC')
+ self._test_training(
+ x_shape, dtype, [2], np.float32, use_gpu=False, data_format='NHWC')
x_shape = [1, 2, 1, 6]
if test.is_gpu_available(cuda_only=True):
@@ -394,20 +394,20 @@ class BatchNormalizationTest(test.TestCase):
x_shape, dtype, [2], np.float32, use_gpu=True, data_format='NCHW')
x_shape = [27, 131, 127, 6]
- if test.is_gpu_available(cuda_only=True):
- for dtype in [np.float16, np.float32]:
+ for dtype in [np.float16, np.float32]:
+ if test.is_gpu_available(cuda_only=True):
self._test_training(
x_shape, dtype, [131], np.float32, use_gpu=True, data_format='NCHW')
self._test_training(
x_shape, dtype, [6], np.float32, use_gpu=True, data_format='NHWC')
- self._test_training(
- x_shape, np.float32, [6], np.float32, use_gpu=False, data_format='NHWC')
+ self._test_training(
+ x_shape, dtype, [6], np.float32, use_gpu=False, data_format='NHWC')
def testBatchNormGrad(self):
for is_training in [True, False]:
x_shape = [1, 1, 6, 1]
- if test.is_gpu_available(cuda_only=True):
- for dtype in [np.float16, np.float32]:
+ for dtype in [np.float16, np.float32]:
+ if test.is_gpu_available(cuda_only=True):
self._test_gradient(
x_shape,
dtype, [1],
@@ -422,17 +422,17 @@ class BatchNormalizationTest(test.TestCase):
use_gpu=True,
data_format='NCHW',
is_training=is_training)
- self._test_gradient(
- x_shape,
- np.float32, [1],
- np.float32,
- use_gpu=False,
- data_format='NHWC',
- is_training=is_training)
+ self._test_gradient(
+ x_shape,
+ dtype, [1],
+ np.float32,
+ use_gpu=False,
+ data_format='NHWC',
+ is_training=is_training)
x_shape = [1, 1, 6, 2]
- if test.is_gpu_available(cuda_only=True):
- for dtype in [np.float16, np.float32]:
+ for dtype in [np.float16, np.float32]:
+ if test.is_gpu_available(cuda_only=True):
self._test_gradient(
x_shape,
dtype, [2],
@@ -440,13 +440,13 @@ class BatchNormalizationTest(test.TestCase):
use_gpu=True,
data_format='NHWC',
is_training=is_training)
- self._test_gradient(
- x_shape,
- np.float32, [2],
- np.float32,
- use_gpu=False,
- data_format='NHWC',
- is_training=is_training)
+ self._test_gradient(
+ x_shape,
+ dtype, [2],
+ np.float32,
+ use_gpu=False,
+ data_format='NHWC',
+ is_training=is_training)
x_shape = [1, 2, 1, 6]
if test.is_gpu_available(cuda_only=True):
@@ -460,8 +460,8 @@ class BatchNormalizationTest(test.TestCase):
is_training=is_training)
x_shape = [5, 7, 11, 4]
- if test.is_gpu_available(cuda_only=True):
- for dtype in [np.float16, np.float32]:
+ for dtype in [np.float16, np.float32]:
+ if test.is_gpu_available(cuda_only=True):
self._test_gradient(
x_shape,
dtype, [7],
@@ -476,13 +476,13 @@ class BatchNormalizationTest(test.TestCase):
use_gpu=True,
data_format='NHWC',
is_training=is_training)
- self._test_gradient(
- x_shape,
- np.float32, [4],
- np.float32,
- use_gpu=False,
- data_format='NHWC',
- is_training=is_training)
+ self._test_gradient(
+ x_shape,
+ dtype, [4],
+ np.float32,
+ use_gpu=False,
+ data_format='NHWC',
+ is_training=is_training)
def _testBatchNormGradGrad(self, config):
shape = config['shape']
@@ -506,15 +506,14 @@ class BatchNormalizationTest(test.TestCase):
data_format='NCHW',
is_training=is_training,
err_tolerance=err_tolerance)
- if dtype != np.float16:
- self._test_grad_grad(
- shape,
- np.float32, [shape[3]],
- np.float32,
- use_gpu=False,
- data_format='NHWC',
- is_training=is_training,
- err_tolerance=err_tolerance)
+ self._test_grad_grad(
+ shape,
+ dtype, [shape[3]],
+ np.float32,
+ use_gpu=False,
+ data_format='NHWC',
+ is_training=is_training,
+ err_tolerance=err_tolerance)
def testBatchNormGradGrad(self):
configs = [{
@@ -526,6 +525,10 @@ class BatchNormalizationTest(test.TestCase):
'err_tolerance': 1e-3,
'dtype': np.float32,
}, {
+ 'shape': [2, 3, 4, 5],
+ 'err_tolerance': 1e-2,
+ 'dtype': np.float16,
+ }, {
'shape': [2, 3, 2, 2],
'err_tolerance': 2e-3,
'dtype': np.float16,
diff --git a/tensorflow/python/ops/nn_grad.py b/tensorflow/python/ops/nn_grad.py
index 4b406ba840..557f39fb42 100644
--- a/tensorflow/python/ops/nn_grad.py
+++ b/tensorflow/python/ops/nn_grad.py
@@ -420,6 +420,7 @@ def _SoftmaxCrossEntropyWithLogitsGrad(op, grad_loss, grad_grad):
# grad_loss is the backprop for cost, and we multiply it with the gradients
# (which is output[1])
# grad_grad is the backprop for softmax gradient.
+ # There is no gradient for the labels (None is returned for them below).
#
# Second derivative is just softmax derivative w.r.t. logits.
softmax_grad = op.outputs[1]
@@ -435,15 +436,15 @@ def _SoftmaxCrossEntropyWithLogitsGrad(op, grad_loss, grad_grad):
const_fill_value = tensor_util.constant_value(g)
return const_fill_value is not None and (const_fill_value == 0).all()
- logits = op.inputs[0]
if grad_grad is not None and not IsZero(grad_grad):
+ logits = op.inputs[0]
softmax = nn_ops.softmax(logits)
grad += ((grad_grad - array_ops.squeeze(
math_ops.matmul(grad_grad[:, None, :],
softmax[:, :, None]), axis=1)) * softmax)
- return grad, _BroadcastMul(grad_loss, -nn_ops.log_softmax(logits))
+ return grad, None
@ops.RegisterGradient("SparseSoftmaxCrossEntropyWithLogits")
diff --git a/tensorflow/python/ops/nn_impl.py b/tensorflow/python/ops/nn_impl.py
index 431ea1186a..7297d2f349 100644
--- a/tensorflow/python/ops/nn_impl.py
+++ b/tensorflow/python/ops/nn_impl.py
@@ -32,6 +32,8 @@ from tensorflow.python.ops import math_ops
from tensorflow.python.ops import nn_ops
from tensorflow.python.ops import sparse_ops
from tensorflow.python.ops import variables
+from tensorflow.python.util.deprecation import deprecated_args
+from tensorflow.python.util.deprecation import deprecated_argument_lookup
def log_poisson_loss(targets, log_input, compute_full_loss=False, name=None):
@@ -275,6 +277,9 @@ def _swish_shape(op):
return [op.inputs[0].shape]
+# Set noinline=True so that sigmoid(features) is re-computed during
+# backprop, and we can free the sigmoid(features) expression immediately
+# after use during the forward pass.
@function.Defun(shape_func=_swish_shape, func_name="swish_grad", noinline=True)
def _swish_grad(features, grad):
"""Gradient of Swish function defined below."""
@@ -284,11 +289,6 @@ def _swish_grad(features, grad):
return grad * activation_grad
-# Naively, x * tf.nn.sigmoid(x) requires keeping both x and sigmoid(x) around
-# for backprop, effectively doubling the tensor's memory consumption. We use a
-# @Defun decorator with noinline=True so that sigmoid(features) is re-computed
-# during backprop, and we can free the sigmoid(features) expression immediately
-# after use during the forward pass.
@function.Defun(
grad_func=_swish_grad,
shape_func=_swish_shape,
@@ -298,7 +298,7 @@ def swish(features):
# pylint: disable=g-doc-args
"""Computes the Swish activation function: `x * sigmoid(x)`.
- Source: "Searching for Activation Functions" (Ramachandran et al. 2017)
+ Source: "Swish: a Self-Gated Activation Function" (Ramachandran et al. 2017)
https://arxiv.org/abs/1710.05941
Args:
@@ -313,19 +313,20 @@ def swish(features):
return features * math_ops.sigmoid(features)
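A quick numeric check of the gradient the `noinline` Defun above recomputes; the closed form follows from the product rule, d/dx[x * sigmoid(x)] = sigmoid(x) * (1 + x * (1 - sigmoid(x))):

```python
import numpy as np

def sigmoid(v):
    return 1.0 / (1.0 + np.exp(-v))

x, eps = 0.7, 1e-6
analytic = sigmoid(x) * (1 + x * (1 - sigmoid(x)))
numeric = ((x + eps) * sigmoid(x + eps)
           - (x - eps) * sigmoid(x - eps)) / (2 * eps)
assert abs(analytic - numeric) < 1e-6
```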
-def l2_normalize(x, dim, epsilon=1e-12, name=None):
- """Normalizes along dimension `dim` using an L2 norm.
+@deprecated_args(None, "dim is deprecated, use axis instead", "dim")
+def l2_normalize(x, axis=None, epsilon=1e-12, name=None, dim=None):
+ """Normalizes along dimension `axis` using an L2 norm.
- For a 1-D tensor with `dim = 0`, computes
+ For a 1-D tensor with `axis = 0`, computes
output = x / sqrt(max(sum(x**2), epsilon))
For `x` with more dimensions, independently normalizes each 1-D slice along
- dimension `dim`.
+ dimension `axis`.
Args:
x: A `Tensor`.
- dim: Dimension along which to normalize. A scalar or a vector of
+ axis: Dimension along which to normalize. A scalar or a vector of
integers.
epsilon: A lower bound value for the norm. Will use `sqrt(epsilon)` as the
divisor if `norm < sqrt(epsilon)`.
@@ -335,8 +336,9 @@ def l2_normalize(x, dim, epsilon=1e-12, name=None):
A `Tensor` with the same shape as `x`.
"""
with ops.name_scope(name, "l2_normalize", [x]) as name:
+ axis = deprecated_argument_lookup("axis", axis, "dim", dim)
x = ops.convert_to_tensor(x, name="x")
- square_sum = math_ops.reduce_sum(math_ops.square(x), dim, keep_dims=True)
+ square_sum = math_ops.reduce_sum(math_ops.square(x), axis, keep_dims=True)
x_inv_norm = math_ops.rsqrt(math_ops.maximum(square_sum, epsilon))
return math_ops.multiply(x, x_inv_norm, name=name)
diff --git a/tensorflow/python/ops/nn_ops.py b/tensorflow/python/ops/nn_ops.py
index bdaac65904..c4de2c7f00 100644
--- a/tensorflow/python/ops/nn_ops.py
+++ b/tensorflow/python/ops/nn_ops.py
@@ -23,6 +23,7 @@ import numbers
import numpy as np
from tensorflow.python.eager import context
+from tensorflow.python.framework import constant_op
from tensorflow.python.framework import dtypes
from tensorflow.python.framework import graph_util
from tensorflow.python.framework import ops
@@ -32,13 +33,13 @@ from tensorflow.python.ops import array_ops
from tensorflow.python.ops import gen_nn_ops
from tensorflow.python.ops import math_ops
from tensorflow.python.ops import random_ops
-
# go/tf-wildcard-import
# pylint: disable=wildcard-import
from tensorflow.python.ops.gen_nn_ops import *
# pylint: enable=wildcard-import
+from tensorflow.python.util.deprecation import deprecated_args
+from tensorflow.python.util.deprecation import deprecated_argument_lookup
-from tensorflow.python.util import deprecation
# Aliases for some automatically-generated names.
local_response_normalization = gen_nn_ops.lrn
@@ -1645,17 +1646,18 @@ def _softmax(logits, compute_op, dim=-1, name=None):
return output
-def softmax(logits, dim=-1, name=None):
+@deprecated_args(None, "dim is deprecated, use axis instead", "dim")
+def softmax(logits, axis=None, name=None, dim=None):
"""Computes softmax activations.
This function performs the equivalent of
- softmax = tf.exp(logits) / tf.reduce_sum(tf.exp(logits), dim)
+ softmax = tf.exp(logits) / tf.reduce_sum(tf.exp(logits), axis)
Args:
logits: A non-empty `Tensor`. Must be one of the following types: `half`,
`float32`, `float64`.
- dim: The dimension softmax would be performed on. The default is -1 which
+ axis: The dimension softmax would be performed on. The default is -1 which
indicates the last dimension.
name: A name for the operation (optional).
@@ -1663,23 +1665,27 @@ def softmax(logits, dim=-1, name=None):
A `Tensor`. Has the same type and shape as `logits`.
Raises:
- InvalidArgumentError: if `logits` is empty or `dim` is beyond the last
+ InvalidArgumentError: if `logits` is empty or `axis` is beyond the last
dimension of `logits`.
"""
- return _softmax(logits, gen_nn_ops._softmax, dim, name)
+ axis = deprecated_argument_lookup("axis", axis, "dim", dim)
+ if axis is None:
+ axis = -1
+ return _softmax(logits, gen_nn_ops._softmax, axis, name)
-def log_softmax(logits, dim=-1, name=None):
+@deprecated_args(None, "dim is deprecated, use axis instead", "dim")
+def log_softmax(logits, axis=None, name=None, dim=None):
"""Computes log softmax activations.
For each batch `i` and class `j` we have
- logsoftmax = logits - log(reduce_sum(exp(logits), dim))
+ logsoftmax = logits - log(reduce_sum(exp(logits), axis))
Args:
logits: A non-empty `Tensor`. Must be one of the following types: `half`,
`float32`, `float64`.
- dim: The dimension softmax would be performed on. The default is -1 which
+ axis: The dimension softmax would be performed on. The default is -1 which
indicates the last dimension.
name: A name for the operation (optional).
@@ -1687,10 +1693,13 @@ def log_softmax(logits, dim=-1, name=None):
A `Tensor`. Has the same type as `logits`. Same shape as `logits`.
Raises:
- InvalidArgumentError: if `logits` is empty or `dim` is beyond the last
+ InvalidArgumentError: if `logits` is empty or `axis` is beyond the last
dimension of `logits`.
"""
- return _softmax(logits, gen_nn_ops._log_softmax, dim, name)
+ axis = deprecated_argument_lookup("axis", axis, "dim", dim)
+ if axis is None:
+ axis = -1
+ return _softmax(logits, gen_nn_ops._log_softmax, axis, name)
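A small numeric check of the identity in the `log_softmax` docstring above, `logsoftmax = logits - log(reduce_sum(exp(logits), axis))`:

```python
import numpy as np

z = np.array([1.0, 2.0, 3.0])
log_softmax = z - np.log(np.sum(np.exp(z)))
assert abs(np.exp(log_softmax).sum() - 1.0) < 1e-12  # exp of rows sums to 1
```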
def _ensure_xent_args(name, sentinel, labels, logits):
@@ -1702,9 +1711,9 @@ def _ensure_xent_args(name, sentinel, labels, logits):
raise ValueError("Both labels and logits must be provided.")
-def softmax_cross_entropy_with_logits_v2(_sentinel=None, # pylint: disable=invalid-name
- labels=None, logits=None,
- dim=-1, name=None):
+def softmax_cross_entropy_with_logits(_sentinel=None, # pylint: disable=invalid-name
+ labels=None, logits=None,
+ dim=-1, name=None):
"""Computes softmax cross entropy between `logits` and `labels`.
Measures the probability error in discrete classification tasks in which the
@@ -1728,10 +1737,6 @@ def softmax_cross_entropy_with_logits_v2(_sentinel=None, # pylint: disable=inva
`[batch_size, num_classes]` and the same dtype (either `float16`, `float32`,
or `float64`).
- Backpropagation will happen into both `logits` and `labels`. To disallow
- backpropagation into `labels`, pass label tensors through a `stop_gradients`
- before feeding it to this function.
-
**Note that to avoid confusion, it is required to pass only named arguments to
this function.**
@@ -1753,123 +1758,57 @@ def softmax_cross_entropy_with_logits_v2(_sentinel=None, # pylint: disable=inva
# could break users who call this with bad labels, but disregard the bad
# results.
- with ops.name_scope(
- name, "softmax_cross_entropy_with_logits", [logits, labels]) as name:
- logits = ops.convert_to_tensor(logits, name="logits")
- labels = ops.convert_to_tensor(labels, name="labels")
- precise_logits = math_ops.cast(logits, dtypes.float32) if (
- logits.dtype == dtypes.float16) else logits
- # labels and logits must be of the same type
- labels = math_ops.cast(labels, precise_logits.dtype)
- input_rank = array_ops.rank(precise_logits)
- # For shape inference.
- shape = logits.get_shape()
-
- # Move the dim to the end if dim is not the last dimension.
- if dim is not -1:
- def _move_dim_to_end(tensor, dim_index, rank):
- return array_ops.transpose(tensor,
- array_ops.concat([
- math_ops.range(dim_index),
- math_ops.range(dim_index + 1, rank),
- [dim_index]
- ], 0))
-
- precise_logits = _move_dim_to_end(precise_logits, dim, input_rank)
- labels = _move_dim_to_end(labels, dim, input_rank)
-
- input_shape = array_ops.shape(precise_logits)
-
- # Make precise_logits and labels into matrices.
- precise_logits = _flatten_outer_dims(precise_logits)
- labels = _flatten_outer_dims(labels)
-
- # Do the actual op computation.
- # The second output tensor contains the gradients. We use it in
- # _CrossEntropyGrad() in nn_grad but not here.
- cost, unused_backprop = gen_nn_ops._softmax_cross_entropy_with_logits(
- precise_logits, labels, name=name)
-
- # The output cost shape should be the input minus dim.
- output_shape = array_ops.slice(input_shape, [0],
- [math_ops.subtract(input_rank, 1)])
- cost = array_ops.reshape(cost, output_shape)
-
- # Make shape inference work since reshape and transpose may erase its static
- # shape.
- if context.in_graph_mode() and shape is not None and shape.dims is not None:
- shape = shape.as_list()
- del shape[dim]
- cost.set_shape(shape)
-
- if logits.dtype == dtypes.float16:
- return math_ops.cast(cost, dtypes.float16)
- else:
- return cost
-
-
-_XENT_DEPRECATION = """
-Future major versions of TensorFlow will allow gradients to flow
-into the labels input on backprop by default.
-
-See tf.nn.softmax_cross_entropy_with_logits_v2.
-"""
-
-
-@deprecation.deprecated(date=None, instructions=_XENT_DEPRECATION)
-def softmax_cross_entropy_with_logits(_sentinel=None, # pylint: disable=invalid-name
- labels=None, logits=None,
- dim=-1, name=None):
- """Computes softmax cross entropy between `logits` and `labels`.
-
- Measures the probability error in discrete classification tasks in which the
- classes are mutually exclusive (each entry is in exactly one class). For
- example, each CIFAR-10 image is labeled with one and only one label: an image
- can be a dog or a truck, but not both.
-
- **NOTE:** While the classes are mutually exclusive, their probabilities
- need not be. All that is required is that each row of `labels` is
- a valid probability distribution. If they are not, the computation of the
- gradient will be incorrect.
+ logits = ops.convert_to_tensor(logits)
+ labels = ops.convert_to_tensor(labels)
+ precise_logits = math_ops.cast(logits, dtypes.float32) if (
+ logits.dtype == dtypes.float16) else logits
+ # labels and logits must be of the same type
+ labels = math_ops.cast(labels, precise_logits.dtype)
+ input_rank = array_ops.rank(precise_logits)
+ # For shape inference.
+ shape = logits.get_shape()
- If using exclusive `labels` (wherein one and only
- one class is true at a time), see `sparse_softmax_cross_entropy_with_logits`.
+ # Move the dim to the end if dim is not the last dimension.
+  if dim != -1:
+ def _move_dim_to_end(tensor, dim_index, rank):
+ return array_ops.transpose(tensor,
+ array_ops.concat([
+ math_ops.range(dim_index),
+ math_ops.range(dim_index + 1, rank),
+ [dim_index]
+ ], 0))
- **WARNING:** This op expects unscaled logits, since it performs a `softmax`
- on `logits` internally for efficiency. Do not call this op with the
- output of `softmax`, as it will produce incorrect results.
+ precise_logits = _move_dim_to_end(precise_logits, dim, input_rank)
+ labels = _move_dim_to_end(labels, dim, input_rank)
- `logits` and `labels` must have the same shape, e.g.
- `[batch_size, num_classes]` and the same dtype (either `float16`, `float32`,
- or `float64`).
+ input_shape = array_ops.shape(precise_logits)
- Backpropagation will happen only into `logits`. To calculate a cross entropy
- loss that allows backpropagation into both `logits` and `labels`, see
- @{tf.nn.softmax_cross_entropy_with_logits_v2}.
+ # Make precise_logits and labels into matrices.
+ precise_logits = _flatten_outer_dims(precise_logits)
+ labels = _flatten_outer_dims(labels)
- **Note that to avoid confusion, it is required to pass only named arguments to
- this function.**
+ # Do the actual op computation.
+ # The second output tensor contains the gradients. We use it in
+ # _CrossEntropyGrad() in nn_grad but not here.
+ cost, unused_backprop = gen_nn_ops._softmax_cross_entropy_with_logits(
+ precise_logits, labels, name=name)
- Args:
- _sentinel: Used to prevent positional parameters. Internal, do not use.
- labels: Each row `labels[i]` must be a valid probability distribution.
- logits: Unscaled log probabilities.
- dim: The class dimension. Defaulted to -1 which is the last dimension.
- name: A name for the operation (optional).
+ # The output cost shape should be the input minus dim.
+ output_shape = array_ops.slice(input_shape, [0],
+ [math_ops.subtract(input_rank, 1)])
+ cost = array_ops.reshape(cost, output_shape)
- Returns:
- A 1-D `Tensor` of length `batch_size` of the same type as `logits` with the
- softmax cross entropy loss.
- """
- _ensure_xent_args("softmax_cross_entropy_with_logits", _sentinel,
- labels, logits)
-
- with ops.name_scope(
- name, "softmax_cross_entropy_with_logits_sg", [logits, labels]) as name:
- labels = array_ops.stop_gradient(labels, name="labels_stop_gradient")
+ # Make shape inference work since reshape and transpose may erase its static
+ # shape.
+ if context.in_graph_mode() and shape is not None and shape.dims is not None:
+ shape = shape.as_list()
+ del shape[dim]
+ cost.set_shape(shape)
- return softmax_cross_entropy_with_logits_v2(
- labels=labels, logits=logits, dim=dim, name=name)
+ if logits.dtype == dtypes.float16:
+ return math_ops.cast(cost, dtypes.float16)
+ else:
+ return cost
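For readers tracing the transpose above, this NumPy sketch reproduces the _move_dim_to_end permutation (an illustrative helper only, not the graph op built with math_ops.range/array_ops.concat):

import numpy as np

def move_dim_to_end(x, dim):
    # Permutation [0..dim-1, dim+1..rank-1, dim], matching the
    # range/concat construction in the snippet above.
    perm = [i for i in range(x.ndim) if i != dim] + [dim]
    return np.transpose(x, perm)

x = np.zeros((2, 3, 4))
assert move_dim_to_end(x, 1).shape == (2, 4, 3)   # class dim moved last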
def sparse_softmax_cross_entropy_with_logits(_sentinel=None, # pylint: disable=invalid-name
@@ -2305,6 +2244,100 @@ def conv1d(value, filters, stride, padding,
return array_ops.squeeze(result, [spatial_start_dim])
+def conv1d_transpose(value,
+ filter,
+ output_shape,
+ stride,
+ padding="SAME",
+ data_format="NWC",
+ name=None):
+ """The transpose of `conv1d`.
+
+ This operation is sometimes called "deconvolution" after [Deconvolutional
+ Networks](http://www.matthewzeiler.com/pubs/cvpr2010/cvpr2010.pdf), but is
+ actually the transpose (gradient) of `conv1d` rather than an actual
+ deconvolution.
+
+ Args:
+ value: A 3-D `Tensor` of type `float` and shape
+ `[batch, in_width, in_channels]` for `NWC` data format or
+ `[batch, in_channels, in_width]` for `NCW` data format.
+ filter: A 3-D `Tensor` with the same type as `value` and shape
+ `[filter_width, output_channels, in_channels]`. `filter`'s
+ `in_channels` dimension must match that of `value`.
+ output_shape: A 1-D `Tensor` representing the output shape of the
+ deconvolution op.
+ stride: An `integer`. The number of entries by which
+ the filter is moved right at each step.
+ padding: A string, either `'VALID'` or `'SAME'`. The padding algorithm.
+ See the @{tf.nn.convolution$comment here}
+    data_format: A string. `'NWC'` and `'NCW'` are supported.
+ name: Optional name for the returned tensor.
+
+ Returns:
+ A `Tensor` with the same type as `value`.
+
+ Raises:
+ ValueError: If input/output depth does not match `filter`'s shape, or if
+ padding is other than `'VALID'` or `'SAME'`.
+ """
+ with ops.name_scope(name, "conv1d_transpose",
+ [value, filter, output_shape]) as name:
+ output_shape_ = ops.convert_to_tensor(output_shape, name="output_shape")
+ if not output_shape_.get_shape().is_compatible_with(tensor_shape.vector(3)):
+ raise ValueError("output_shape must have shape (3,), got {}"
+ .format(output_shape_.get_shape()))
+
+ # The format could be either NWC or NCW, map to NHWC or NCHW
+ if data_format is None or data_format == "NWC":
+ data_format_2d = "NHWC"
+ axis = 2
+ elif data_format == "NCW":
+ data_format_2d = "NCHW"
+ axis = 1
+ else:
+ raise ValueError("data_format must be \"NWC\" or \"NCW\".")
+
+ if not value.get_shape()[axis].is_compatible_with(filter.get_shape()[2]):
+ raise ValueError("input channels does not match filter's input channels, "
+ "{} != {}".format(value.get_shape()[axis],
+ filter.get_shape()[2]))
+
+ if isinstance(output_shape, (list, np.ndarray)):
+      # output_shape's shape should be [3] if we reach this point.
+ if not filter.get_shape()[1].is_compatible_with(output_shape[axis]):
+ raise ValueError(
+ "output_shape does not match filter's output channels, "
+ "{} != {}".format(output_shape[axis], filter.get_shape()[1]))
+
+ if padding != "VALID" and padding != "SAME":
+ raise ValueError("padding must be either VALID or SAME:"
+ " {}".format(padding))
+
+ # Reshape the input tensor to [batch, 1, in_width, in_channels]
+ if data_format_2d == "NHWC":
+ output_shape_ = array_ops.concat([output_shape_[:1], [1],
+ output_shape_[1:]], axis=0)
+ spatial_start_dim = 1
+ strides = [1, 1, stride, 1]
+ else:
+ output_shape_ = array_ops.concat([output_shape_[:2], [1],
+ output_shape_[2:]], axis=0)
+ spatial_start_dim = 2
+ strides = [1, 1, 1, stride]
+ value = array_ops.expand_dims(value, spatial_start_dim)
+ filter = array_ops.expand_dims(filter, 0)
+
+ result = gen_nn_ops.conv2d_backprop_input(input_sizes=output_shape_,
+ filter=filter,
+ out_backprop=value,
+ strides=strides,
+ padding=padding,
+ data_format=data_format_2d,
+ name=name)
+ return array_ops.squeeze(result, [spatial_start_dim])
+
+
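A hedged usage sketch for the conv1d_transpose added above (TF 1.x graph mode; the tf.nn export path is an assumption, and shapes follow the docstring):

import tensorflow as tf

batch, in_width, in_ch, out_ch, fwidth = 4, 8, 16, 32, 3
value = tf.random_normal([batch, in_width, in_ch])   # NWC input
filt = tf.random_normal([fwidth, out_ch, in_ch])     # [width, out, in]

# With stride 2 and SAME padding the spatial width doubles: 8 -> 16.
out = tf.nn.conv1d_transpose(value, filt,
                             output_shape=[batch, 16, out_ch],
                             stride=2, padding="SAME")
print(out.shape)  # (4, 16, 32)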
@ops.RegisterStatistics("Dilation2D", "flops")
def _calc_dilation2d_flops(graph, node):
"""Calculates the compute resources needed for Dilation2D."""
diff --git a/tensorflow/python/ops/variable_scope.py b/tensorflow/python/ops/variable_scope.py
index 9a0ff75594..92fa928eed 100644
--- a/tensorflow/python/ops/variable_scope.py
+++ b/tensorflow/python/ops/variable_scope.py
@@ -1227,11 +1227,6 @@ class EagerVariableStore(object):
def variables(self):
return self._store._vars.values() # pylint: disable=protected-access
- def trainable_variables(self):
- # pylint: disable=protected-access
- return [x for x in self._store._vars.values() if x._trainable]
- # pylint: enable=protected-access
-
def get_variable(name,
shape=None,
diff --git a/tensorflow/python/ops/variables.py b/tensorflow/python/ops/variables.py
index f906b7b3c4..eab7c3828f 100644
--- a/tensorflow/python/ops/variables.py
+++ b/tensorflow/python/ops/variables.py
@@ -1063,13 +1063,13 @@ class Variable(object):
class PartitionedVariable(object):
"""A container for partitioned `Variable` objects.
- @compatiblity(eager) `tf.PartitionedVariable` is not compatible with
+ @compatibility(eager) `tf.PartitionedVariable` is not compatible with
  eager execution. Use `tfe.Variable` instead which is compatible
with both eager execution and graph construction. See [the
TensorFlow Eager Execution
guide](https://github.com/tensorflow/tensorflow/tree/master/tensorflow/contrib/eager/python/g3doc/guide.md#variables-and-optimizers)
for details on how variables work in eager execution.
- @end_compatiblity
+ @end_compatibility
"""
class PartitionedVariableIterator(object):
diff --git a/tensorflow/python/pywrap_tfe.i b/tensorflow/python/pywrap_tfe.i
index cbacf458a0..637f738fed 100644
--- a/tensorflow/python/pywrap_tfe.i
+++ b/tensorflow/python/pywrap_tfe.i
@@ -29,7 +29,7 @@ limitations under the License.
%rename("%s") TFE_Py_TapeWatch;
%rename("%s") TFE_Py_TapeDeleteTrace;
%rename("%s") TFE_Py_TapeRecordOperation;
-%rename("%s") TFE_Py_TapeGradient;
+%rename("%s") TFE_Py_TapeExport;
%rename("%s") TFE_NewContextOptions;
%rename("%s") TFE_ContextOptionsSetConfig;
%rename("%s") TFE_ContextOptionsSetDevicePlacementPolicy;
@@ -125,7 +125,7 @@ limitations under the License.
SWIG_fail;
}
if (EagerTensor_CheckExact(elem)) {
- (*$1)[i] = EagerTensor_Handle(elem);
+ (*$1)[i] = EagerTensorHandle(elem);
} else {
SWIG_exception_fail(SWIG_TypeError,
"provided list of inputs contains objects other "
diff --git a/tensorflow/python/tools/import_pb_to_tensorboard.py b/tensorflow/python/tools/import_pb_to_tensorboard.py
index 00de044505..00de044505 100644..100755
--- a/tensorflow/python/tools/import_pb_to_tensorboard.py
+++ b/tensorflow/python/tools/import_pb_to_tensorboard.py
diff --git a/tensorflow/python/tools/inspect_checkpoint.py b/tensorflow/python/tools/inspect_checkpoint.py
index 8716058e61..47a74e5abf 100644
--- a/tensorflow/python/tools/inspect_checkpoint.py
+++ b/tensorflow/python/tools/inspect_checkpoint.py
@@ -29,8 +29,7 @@ from tensorflow.python.platform import flags
FLAGS = None
-def print_tensors_in_checkpoint_file(file_name, tensor_name, all_tensors,
- all_tensor_names):
+def print_tensors_in_checkpoint_file(file_name, tensor_name, all_tensors):
"""Prints tensors in a checkpoint file.
If no `tensor_name` is provided, prints the tensor names and shapes
@@ -42,16 +41,14 @@ def print_tensors_in_checkpoint_file(file_name, tensor_name, all_tensors,
file_name: Name of the checkpoint file.
tensor_name: Name of the tensor in the checkpoint file to print.
all_tensors: Boolean indicating whether to print all tensors.
- all_tensor_names: Boolean indicating whether to print all tensor names.
"""
try:
reader = pywrap_tensorflow.NewCheckpointReader(file_name)
- if all_tensors or all_tensor_names:
+ if all_tensors:
var_to_shape_map = reader.get_variable_to_shape_map()
for key in sorted(var_to_shape_map):
print("tensor_name: ", key)
- if all_tensors:
- print(reader.get_tensor(key))
+ print(reader.get_tensor(key))
elif not tensor_name:
print(reader.debug_string().decode("utf-8"))
else:
@@ -107,14 +104,11 @@ def parse_numpy_printoption(kv_str):
def main(unused_argv):
if not FLAGS.file_name:
print("Usage: inspect_checkpoint --file_name=checkpoint_file_name "
- "[--tensor_name=tensor_to_print] "
- "[--all_tensors] "
- "[--all_tensor_names] "
- "[--printoptions]")
+ "[--tensor_name=tensor_to_print]")
sys.exit(1)
else:
print_tensors_in_checkpoint_file(FLAGS.file_name, FLAGS.tensor_name,
- FLAGS.all_tensors, FLAGS.all_tensor_names)
+ FLAGS.all_tensors)
if __name__ == "__main__":
@@ -137,13 +131,6 @@ if __name__ == "__main__":
default=False,
help="If True, print the values of all the tensors.")
parser.add_argument(
- "--all_tensor_names",
- nargs="?",
- const=True,
- type="bool",
- default=False,
- help="If True, print the names of all the tensors.")
- parser.add_argument(
"--printoptions",
nargs="*",
type=parse_numpy_printoption,
diff --git a/tensorflow/python/training/monitored_session.py b/tensorflow/python/training/monitored_session.py
index 1f6016a91b..af9f11bb07 100644
--- a/tensorflow/python/training/monitored_session.py
+++ b/tensorflow/python/training/monitored_session.py
@@ -536,7 +536,6 @@ class _MonitoredSession(object):
will return True.
Example usage:
-
```python
with tf.Graph().as_default():
c = tf.placeholder(dtypes.float32)
@@ -553,7 +552,6 @@ class _MonitoredSession(object):
while not session.should_stop():
a = session.run_step_fn(step_fn)
```
-
Hooks interact with the `run_with_hooks()` call inside the `step_fn`
as they do with a `MonitoredSession.run` call.
diff --git a/tensorflow/python/util/tf_should_use.py b/tensorflow/python/util/tf_should_use.py
index 37733152e8..a576547d5f 100644
--- a/tensorflow/python/util/tf_should_use.py
+++ b/tensorflow/python/util/tf_should_use.py
@@ -44,7 +44,7 @@ def _add_should_use_warning(x, fatal_error=False):
and is a very shallow wrapper for `x` which logs access into `x`.
"""
del fatal_error
- if x is None or x == []: # pylint: disable=g-explicit-bool-comparison
+ if x is None: # special corner case where x is None
return x
if context.in_eager_mode():
diff --git a/tensorflow/stream_executor/cuda/cuda_dnn.cc b/tensorflow/stream_executor/cuda/cuda_dnn.cc
index ad8164c7f9..2094061b44 100644
--- a/tensorflow/stream_executor/cuda/cuda_dnn.cc
+++ b/tensorflow/stream_executor/cuda/cuda_dnn.cc
@@ -232,7 +232,6 @@ CUDNN_DNN_ROUTINE_EACH_R3(PERFTOOLS_GPUTOOLS_CUDNN_WRAP)
__macro(cudnnRNNBackwardData) \
__macro(cudnnRNNBackwardWeights) \
__macro(cudnnSetRNNDescriptor) \
- __macro(cudnnSetRNNDescriptor_v6) \
__macro(cudnnGetFilterNdDescriptor)
// clang-format on
@@ -245,7 +244,8 @@ CUDNN_DNN_ROUTINE_EACH_R5(PERFTOOLS_GPUTOOLS_CUDNN_WRAP)
// clang-format off
#if CUDNN_VERSION >= 6000
#define CUDNN_DNN_ROUTINE_EACH_R6(__macro) \
- __macro(cudnnConvolutionBiasActivationForward)
+ __macro(cudnnConvolutionBiasActivationForward) \
+ __macro(cudnnSetRNNDescriptor_v6)
// clang-format on
CUDNN_DNN_ROUTINE_EACH_R6(PERFTOOLS_GPUTOOLS_CUDNN_WRAP)
@@ -390,8 +390,8 @@ port::Status CudnnSupport::Init() {
<< DriverVersionStatusToString(result);
} else {
const auto& version = result.ValueOrDie();
- LOG(ERROR) << "possibly insufficient driver version: "
- << DriverVersionToString(version);
+ LOG(INFO) << "possibly insufficient driver version: "
+ << DriverVersionToString(version);
// OS X kernel driver does not report version accurately
#if !defined(__APPLE__)
if (std::get<0>(version) < 340) {
@@ -665,7 +665,6 @@ class ScopedPoolingDescriptor {
LOG(FATAL) << "could not create cudnn pooling descriptor: "
<< ToString(status);
}
-
const std::vector<int64> strides64 = pooling_descriptor.strides();
const std::vector<int64> padding64 = pooling_descriptor.padding();
const std::vector<int64> shape64 = pooling_descriptor.window();
@@ -680,14 +679,14 @@ class ScopedPoolingDescriptor {
&CheckedNarrowing<int64, int>);
std::transform(shape64.cbegin(), shape64.cend(), shape.begin(),
&CheckedNarrowing<int64, int>);
+ bool propagate_nans = pooling_descriptor.propagate_nans();
status = wrap::cudnnSetPoolingNdDescriptor(
parent_, handle_,
(pooling_descriptor.mode() == dnn::PoolingMode::kMaximum
? CUDNN_POOLING_MAX
: CUDNN_POOLING_AVERAGE_COUNT_EXCLUDE_PADDING),
#if CUDNN_VERSION >= 5000
- // Always propagate nans.
- CUDNN_PROPAGATE_NAN,
+ propagate_nans ? CUDNN_PROPAGATE_NAN : CUDNN_NOT_PROPAGATE_NAN,
#endif
nd, shape.data(), padding.data(), strides.data());
if (status != CUDNN_STATUS_SUCCESS) {
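To make the behavioral difference selected by the new propagate_nans flag concrete, here is a pure-Python max-pooling-window sketch (an illustration of the two cuDNN modes, not the cuDNN kernel):

import math

def max_pool_window(window, propagate_nans):
    # CUDNN_PROPAGATE_NAN: any NaN in the window poisons the output.
    # CUDNN_NOT_PROPAGATE_NAN: NaNs are skipped when taking the max.
    if propagate_nans and any(math.isnan(v) for v in window):
        return float("nan")
    return max(v for v in window if not math.isnan(v))

w = [1.0, float("nan"), 3.0]
assert max_pool_window(w, propagate_nans=False) == 3.0
assert math.isnan(max_pool_window(w, propagate_nans=True))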
diff --git a/tensorflow/stream_executor/dnn.cc b/tensorflow/stream_executor/dnn.cc
index 07fe8a85f4..29fd6d0e87 100644
--- a/tensorflow/stream_executor/dnn.cc
+++ b/tensorflow/stream_executor/dnn.cc
@@ -472,7 +472,8 @@ PoolingDescriptor::PoolingDescriptor(int ndims)
ndims_(ndims),
window_(ndims, 0),
padding_(ndims, 0),
- strides_(ndims, 1) {}
+ strides_(ndims, 1),
+ propagate_nans_(false) {}
PoolingDescriptor::PoolingDescriptor() : PoolingDescriptor(/*ndims=*/2) {}
@@ -482,6 +483,7 @@ void PoolingDescriptor::CloneFrom(const PoolingDescriptor& other) {
window_ = other.window_;
padding_ = other.padding_;
strides_ = other.strides_;
+ propagate_nans_ = other.propagate_nans_;
}
string PoolingDescriptor::ToString() const {
@@ -495,9 +497,12 @@ string PoolingDescriptor::ToString() const {
port::Appendf(&padding, "%lld", padding_[i]);
}
- return port::Printf("{mode: %s window: %s strides: %s padding: %s}",
- mode_string, window.c_str(), strides.c_str(),
- padding.c_str());
+ const char* propagate_string = propagate_nans_ ? "Yes" : "No";
+
+ return port::Printf(
+ "{mode: %s window: %s strides: %s padding: %s propagate NaNs: %s}",
+ mode_string, window.c_str(), strides.c_str(), padding.c_str(),
+ propagate_string);
}
string PoolingDescriptor::ToShortString() const {
@@ -508,7 +513,8 @@ string PoolingDescriptor::ToShortString() const {
port::Appendf(&padding, "_p%d:%lld", i, padding_[i]);
}
return port::StrCat(mode_ == dnn::PoolingMode::kMaximum ? "max" : "avg",
- window, strides, padding);
+ window, strides, padding,
+ propagate_nans_ ? "propagate_nans" : "ignore_nans");
}
// -- NormalizeDescriptor
diff --git a/tensorflow/stream_executor/dnn.h b/tensorflow/stream_executor/dnn.h
index 49235167ab..0d2cd4a9f2 100644
--- a/tensorflow/stream_executor/dnn.h
+++ b/tensorflow/stream_executor/dnn.h
@@ -661,6 +661,10 @@ class PoolingDescriptor {
SetDim(&strides_, dim, value);
return *this;
}
+ PoolingDescriptor& set_propagate_nans(bool value) {
+ propagate_nans_ = value;
+ return *this;
+ }
int ndims() const { return ndims_; }
void CloneFrom(const PoolingDescriptor& other);
@@ -681,10 +685,12 @@ class PoolingDescriptor {
std::vector<int64> window() const { return window_; }
std::vector<int64> padding() const { return padding_; }
std::vector<int64> strides() const { return strides_; }
+ bool propagate_nans() const { return propagate_nans_; }
private:
PoolingMode mode_;
int ndims_;
+ bool propagate_nans_;
// Stored as: ..., y, x.
std::vector<int64> window_;
diff --git a/tensorflow/tensorflow.bzl b/tensorflow/tensorflow.bzl
index 43ecb7f937..16c3386e15 100644
--- a/tensorflow/tensorflow.bzl
+++ b/tensorflow/tensorflow.bzl
@@ -172,8 +172,8 @@ def tf_copts():
"-DEIGEN_AVOID_STL_ARRAY",
"-Iexternal/gemmlowp",
"-Wno-sign-compare",
- "-fno-exceptions",
"-ftemplate-depth=900",
+ "-fno-exceptions",
]) + if_cuda(["-DGOOGLE_CUDA=1"]) + if_mkl(["-DINTEL_MKL=1", "-fopenmp",]) + if_android_arm(
["-mfpu=neon"]) + if_linux_x86_64(["-msse3"]) + select({
clean_dep("//tensorflow:android"): [
diff --git a/tensorflow/tools/api/golden/tensorflow.estimator.-baseline-classifier.pbtxt b/tensorflow/tools/api/golden/tensorflow.estimator.-baseline-classifier.pbtxt
deleted file mode 100644
index f5ed263f0e..0000000000
--- a/tensorflow/tools/api/golden/tensorflow.estimator.-baseline-classifier.pbtxt
+++ /dev/null
@@ -1,54 +0,0 @@
-path: "tensorflow.estimator.BaselineClassifier"
-tf_class {
- is_instance: "<class \'tensorflow.python.estimator.canned.baseline.BaselineClassifier\'>"
- is_instance: "<class \'tensorflow.python.estimator.estimator.Estimator\'>"
- is_instance: "<type \'object\'>"
- member {
- name: "config"
- mtype: "<type \'property\'>"
- }
- member {
- name: "model_dir"
- mtype: "<type \'property\'>"
- }
- member {
- name: "model_fn"
- mtype: "<type \'property\'>"
- }
- member {
- name: "params"
- mtype: "<type \'property\'>"
- }
- member_method {
- name: "__init__"
- argspec: "args=[\'self\', \'model_dir\', \'n_classes\', \'weight_column\', \'label_vocabulary\', \'optimizer\', \'config\'], varargs=None, keywords=None, defaults=[\'None\', \'2\', \'None\', \'None\', \'Ftrl\', \'None\'], "
- }
- member_method {
- name: "evaluate"
- argspec: "args=[\'self\', \'input_fn\', \'steps\', \'hooks\', \'checkpoint_path\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\'], "
- }
- member_method {
- name: "export_savedmodel"
- argspec: "args=[\'self\', \'export_dir_base\', \'serving_input_receiver_fn\', \'assets_extra\', \'as_text\', \'checkpoint_path\'], varargs=None, keywords=None, defaults=[\'None\', \'False\', \'None\'], "
- }
- member_method {
- name: "get_variable_names"
- argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
- }
- member_method {
- name: "get_variable_value"
- argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=None"
- }
- member_method {
- name: "latest_checkpoint"
- argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
- }
- member_method {
- name: "predict"
- argspec: "args=[\'self\', \'input_fn\', \'predict_keys\', \'hooks\', \'checkpoint_path\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\'], "
- }
- member_method {
- name: "train"
- argspec: "args=[\'self\', \'input_fn\', \'hooks\', \'steps\', \'max_steps\', \'saving_listeners\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\'], "
- }
-}
diff --git a/tensorflow/tools/api/golden/tensorflow.estimator.-baseline-regressor.pbtxt b/tensorflow/tools/api/golden/tensorflow.estimator.-baseline-regressor.pbtxt
deleted file mode 100644
index 61a29942c5..0000000000
--- a/tensorflow/tools/api/golden/tensorflow.estimator.-baseline-regressor.pbtxt
+++ /dev/null
@@ -1,54 +0,0 @@
-path: "tensorflow.estimator.BaselineRegressor"
-tf_class {
- is_instance: "<class \'tensorflow.python.estimator.canned.baseline.BaselineRegressor\'>"
- is_instance: "<class \'tensorflow.python.estimator.estimator.Estimator\'>"
- is_instance: "<type \'object\'>"
- member {
- name: "config"
- mtype: "<type \'property\'>"
- }
- member {
- name: "model_dir"
- mtype: "<type \'property\'>"
- }
- member {
- name: "model_fn"
- mtype: "<type \'property\'>"
- }
- member {
- name: "params"
- mtype: "<type \'property\'>"
- }
- member_method {
- name: "__init__"
- argspec: "args=[\'self\', \'model_dir\', \'label_dimension\', \'weight_column\', \'optimizer\', \'config\'], varargs=None, keywords=None, defaults=[\'None\', \'1\', \'None\', \'Ftrl\', \'None\'], "
- }
- member_method {
- name: "evaluate"
- argspec: "args=[\'self\', \'input_fn\', \'steps\', \'hooks\', \'checkpoint_path\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\'], "
- }
- member_method {
- name: "export_savedmodel"
- argspec: "args=[\'self\', \'export_dir_base\', \'serving_input_receiver_fn\', \'assets_extra\', \'as_text\', \'checkpoint_path\'], varargs=None, keywords=None, defaults=[\'None\', \'False\', \'None\'], "
- }
- member_method {
- name: "get_variable_names"
- argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
- }
- member_method {
- name: "get_variable_value"
- argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=None"
- }
- member_method {
- name: "latest_checkpoint"
- argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
- }
- member_method {
- name: "predict"
- argspec: "args=[\'self\', \'input_fn\', \'predict_keys\', \'hooks\', \'checkpoint_path\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\'], "
- }
- member_method {
- name: "train"
- argspec: "args=[\'self\', \'input_fn\', \'hooks\', \'steps\', \'max_steps\', \'saving_listeners\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\'], "
- }
-}
diff --git a/tensorflow/tools/api/golden/tensorflow.estimator.pbtxt b/tensorflow/tools/api/golden/tensorflow.estimator.pbtxt
index cdc367b99e..ef93a61bd8 100644
--- a/tensorflow/tools/api/golden/tensorflow.estimator.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.estimator.pbtxt
@@ -1,14 +1,6 @@
path: "tensorflow.estimator"
tf_module {
member {
- name: "BaselineClassifier"
- mtype: "<type \'type\'>"
- }
- member {
- name: "BaselineRegressor"
- mtype: "<type \'type\'>"
- }
- member {
name: "DNNClassifier"
mtype: "<type \'type\'>"
}
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.layers.-g-r-u-cell.pbtxt b/tensorflow/tools/api/golden/tensorflow.keras.layers.-g-r-u-cell.pbtxt
deleted file mode 100644
index 763184899c..0000000000
--- a/tensorflow/tools/api/golden/tensorflow.keras.layers.-g-r-u-cell.pbtxt
+++ /dev/null
@@ -1,179 +0,0 @@
-path: "tensorflow.keras.layers.GRUCell"
-tf_class {
- is_instance: "<class \'tensorflow.python.keras._impl.keras.layers.recurrent.GRUCell\'>"
- is_instance: "<class \'tensorflow.python.keras._impl.keras.engine.topology.Layer\'>"
- is_instance: "<class \'tensorflow.python.layers.base.Layer\'>"
- is_instance: "<type \'object\'>"
- member {
- name: "activity_regularizer"
- mtype: "<type \'property\'>"
- }
- member {
- name: "dtype"
- mtype: "<type \'property\'>"
- }
- member {
- name: "graph"
- mtype: "<type \'property\'>"
- }
- member {
- name: "inbound_nodes"
- mtype: "<type \'property\'>"
- }
- member {
- name: "input"
- mtype: "<type \'property\'>"
- }
- member {
- name: "input_mask"
- mtype: "<type \'property\'>"
- }
- member {
- name: "input_shape"
- mtype: "<type \'property\'>"
- }
- member {
- name: "losses"
- mtype: "<type \'property\'>"
- }
- member {
- name: "name"
- mtype: "<type \'property\'>"
- }
- member {
- name: "non_trainable_variables"
- mtype: "<type \'property\'>"
- }
- member {
- name: "non_trainable_weights"
- mtype: "<type \'property\'>"
- }
- member {
- name: "outbound_nodes"
- mtype: "<type \'property\'>"
- }
- member {
- name: "output"
- mtype: "<type \'property\'>"
- }
- member {
- name: "output_mask"
- mtype: "<type \'property\'>"
- }
- member {
- name: "output_shape"
- mtype: "<type \'property\'>"
- }
- member {
- name: "scope_name"
- mtype: "<type \'property\'>"
- }
- member {
- name: "trainable_variables"
- mtype: "<type \'property\'>"
- }
- member {
- name: "trainable_weights"
- mtype: "<type \'property\'>"
- }
- member {
- name: "updates"
- mtype: "<type \'property\'>"
- }
- member {
- name: "variables"
- mtype: "<type \'property\'>"
- }
- member {
- name: "weights"
- mtype: "<type \'property\'>"
- }
- member_method {
- name: "__init__"
- argspec: "args=[\'self\', \'units\', \'activation\', \'recurrent_activation\', \'use_bias\', \'kernel_initializer\', \'recurrent_initializer\', \'bias_initializer\', \'kernel_regularizer\', \'recurrent_regularizer\', \'bias_regularizer\', \'kernel_constraint\', \'recurrent_constraint\', \'bias_constraint\', \'dropout\', \'recurrent_dropout\', \'implementation\'], varargs=None, keywords=kwargs, defaults=[\'tanh\', \'hard_sigmoid\', \'True\', \'glorot_uniform\', \'orthogonal\', \'zeros\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'0.0\', \'0.0\', \'1\'], "
- }
- member_method {
- name: "add_loss"
- argspec: "args=[\'self\', \'losses\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
- }
- member_method {
- name: "add_update"
- argspec: "args=[\'self\', \'updates\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
- }
- member_method {
- name: "add_variable"
- argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\', \'None\'], "
- }
- member_method {
- name: "add_weight"
- argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\'], "
- }
- member_method {
- name: "apply"
- argspec: "args=[\'self\', \'inputs\'], varargs=args, keywords=kwargs, defaults=None"
- }
- member_method {
- name: "build"
- argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
- }
- member_method {
- name: "call"
- argspec: "args=[\'self\', \'inputs\', \'states\', \'training\'], varargs=None, keywords=None, defaults=[\'None\'], "
- }
- member_method {
- name: "compute_mask"
- argspec: "args=[\'self\', \'inputs\', \'mask\'], varargs=None, keywords=None, defaults=[\'None\'], "
- }
- member_method {
- name: "count_params"
- argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
- }
- member_method {
- name: "from_config"
- argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None"
- }
- member_method {
- name: "get_config"
- argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
- }
- member_method {
- name: "get_input_at"
- argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
- }
- member_method {
- name: "get_input_mask_at"
- argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
- }
- member_method {
- name: "get_input_shape_at"
- argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
- }
- member_method {
- name: "get_losses_for"
- argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None"
- }
- member_method {
- name: "get_output_at"
- argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
- }
- member_method {
- name: "get_output_mask_at"
- argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
- }
- member_method {
- name: "get_output_shape_at"
- argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
- }
- member_method {
- name: "get_updates_for"
- argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None"
- }
- member_method {
- name: "get_weights"
- argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
- }
- member_method {
- name: "set_weights"
- argspec: "args=[\'self\', \'weights\'], varargs=None, keywords=None, defaults=None"
- }
-}
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.layers.-g-r-u.pbtxt b/tensorflow/tools/api/golden/tensorflow.keras.layers.-g-r-u.pbtxt
index 889f2cbc23..9237399254 100644
--- a/tensorflow/tools/api/golden/tensorflow.keras.layers.-g-r-u.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.keras.layers.-g-r-u.pbtxt
@@ -1,35 +1,15 @@
path: "tensorflow.keras.layers.GRU"
tf_class {
is_instance: "<class \'tensorflow.python.keras._impl.keras.layers.recurrent.GRU\'>"
- is_instance: "<class \'tensorflow.python.keras._impl.keras.layers.recurrent.RNN\'>"
+ is_instance: "<class \'tensorflow.python.keras._impl.keras.layers.recurrent.Recurrent\'>"
is_instance: "<class \'tensorflow.python.keras._impl.keras.engine.topology.Layer\'>"
is_instance: "<class \'tensorflow.python.layers.base.Layer\'>"
is_instance: "<type \'object\'>"
member {
- name: "activation"
- mtype: "<type \'property\'>"
- }
- member {
name: "activity_regularizer"
mtype: "<type \'property\'>"
}
member {
- name: "bias_constraint"
- mtype: "<type \'property\'>"
- }
- member {
- name: "bias_initializer"
- mtype: "<type \'property\'>"
- }
- member {
- name: "bias_regularizer"
- mtype: "<type \'property\'>"
- }
- member {
- name: "dropout"
- mtype: "<type \'property\'>"
- }
- member {
name: "dtype"
mtype: "<type \'property\'>"
}
@@ -38,10 +18,6 @@ tf_class {
mtype: "<type \'property\'>"
}
member {
- name: "implementation"
- mtype: "<type \'property\'>"
- }
- member {
name: "inbound_nodes"
mtype: "<type \'property\'>"
}
@@ -58,18 +34,6 @@ tf_class {
mtype: "<type \'property\'>"
}
member {
- name: "kernel_constraint"
- mtype: "<type \'property\'>"
- }
- member {
- name: "kernel_initializer"
- mtype: "<type \'property\'>"
- }
- member {
- name: "kernel_regularizer"
- mtype: "<type \'property\'>"
- }
- member {
name: "losses"
mtype: "<type \'property\'>"
}
@@ -102,34 +66,10 @@ tf_class {
mtype: "<type \'property\'>"
}
member {
- name: "recurrent_activation"
- mtype: "<type \'property\'>"
- }
- member {
- name: "recurrent_constraint"
- mtype: "<type \'property\'>"
- }
- member {
- name: "recurrent_dropout"
- mtype: "<type \'property\'>"
- }
- member {
- name: "recurrent_initializer"
- mtype: "<type \'property\'>"
- }
- member {
- name: "recurrent_regularizer"
- mtype: "<type \'property\'>"
- }
- member {
name: "scope_name"
mtype: "<type \'property\'>"
}
member {
- name: "states"
- mtype: "<type \'property\'>"
- }
- member {
name: "trainable_variables"
mtype: "<type \'property\'>"
}
@@ -138,18 +78,10 @@ tf_class {
mtype: "<type \'property\'>"
}
member {
- name: "units"
- mtype: "<type \'property\'>"
- }
- member {
name: "updates"
mtype: "<type \'property\'>"
}
member {
- name: "use_bias"
- mtype: "<type \'property\'>"
- }
- member {
name: "variables"
mtype: "<type \'property\'>"
}
@@ -159,7 +91,7 @@ tf_class {
}
member_method {
name: "__init__"
- argspec: "args=[\'self\', \'units\', \'activation\', \'recurrent_activation\', \'use_bias\', \'kernel_initializer\', \'recurrent_initializer\', \'bias_initializer\', \'kernel_regularizer\', \'recurrent_regularizer\', \'bias_regularizer\', \'activity_regularizer\', \'kernel_constraint\', \'recurrent_constraint\', \'bias_constraint\', \'dropout\', \'recurrent_dropout\', \'implementation\', \'return_sequences\', \'return_state\', \'go_backwards\', \'stateful\', \'unroll\'], varargs=None, keywords=kwargs, defaults=[\'tanh\', \'hard_sigmoid\', \'True\', \'glorot_uniform\', \'orthogonal\', \'zeros\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'0.0\', \'0.0\', \'1\', \'False\', \'False\', \'False\', \'False\', \'False\'], "
+ argspec: "args=[\'self\', \'units\', \'activation\', \'recurrent_activation\', \'use_bias\', \'kernel_initializer\', \'recurrent_initializer\', \'bias_initializer\', \'kernel_regularizer\', \'recurrent_regularizer\', \'bias_regularizer\', \'activity_regularizer\', \'kernel_constraint\', \'recurrent_constraint\', \'bias_constraint\', \'dropout\', \'recurrent_dropout\'], varargs=None, keywords=kwargs, defaults=[\'tanh\', \'hard_sigmoid\', \'True\', \'glorot_uniform\', \'orthogonal\', \'zeros\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'0.0\', \'0.0\'], "
}
member_method {
name: "add_loss"
@@ -206,6 +138,10 @@ tf_class {
argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
}
member_method {
+ name: "get_constants"
+ argspec: "args=[\'self\', \'inputs\', \'training\'], varargs=None, keywords=None, defaults=[\'None\'], "
+ }
+ member_method {
name: "get_initial_state"
argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None"
}
@@ -223,7 +159,7 @@ tf_class {
}
member_method {
name: "get_losses_for"
- argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+ argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None"
}
member_method {
name: "get_output_at"
@@ -246,6 +182,10 @@ tf_class {
argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
}
member_method {
+ name: "preprocess_input"
+ argspec: "args=[\'self\', \'inputs\', \'training\'], varargs=None, keywords=None, defaults=[\'None\'], "
+ }
+ member_method {
name: "reset_states"
argspec: "args=[\'self\', \'states\'], varargs=None, keywords=None, defaults=[\'None\'], "
}
@@ -253,4 +193,8 @@ tf_class {
name: "set_weights"
argspec: "args=[\'self\', \'weights\'], varargs=None, keywords=None, defaults=None"
}
+ member_method {
+ name: "step"
+ argspec: "args=[\'self\', \'inputs\', \'states\'], varargs=None, keywords=None, defaults=None"
+ }
}
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.layers.-l-s-t-m-cell.pbtxt b/tensorflow/tools/api/golden/tensorflow.keras.layers.-l-s-t-m-cell.pbtxt
deleted file mode 100644
index 4ce7c34f6c..0000000000
--- a/tensorflow/tools/api/golden/tensorflow.keras.layers.-l-s-t-m-cell.pbtxt
+++ /dev/null
@@ -1,179 +0,0 @@
-path: "tensorflow.keras.layers.LSTMCell"
-tf_class {
- is_instance: "<class \'tensorflow.python.keras._impl.keras.layers.recurrent.LSTMCell\'>"
- is_instance: "<class \'tensorflow.python.keras._impl.keras.engine.topology.Layer\'>"
- is_instance: "<class \'tensorflow.python.layers.base.Layer\'>"
- is_instance: "<type \'object\'>"
- member {
- name: "activity_regularizer"
- mtype: "<type \'property\'>"
- }
- member {
- name: "dtype"
- mtype: "<type \'property\'>"
- }
- member {
- name: "graph"
- mtype: "<type \'property\'>"
- }
- member {
- name: "inbound_nodes"
- mtype: "<type \'property\'>"
- }
- member {
- name: "input"
- mtype: "<type \'property\'>"
- }
- member {
- name: "input_mask"
- mtype: "<type \'property\'>"
- }
- member {
- name: "input_shape"
- mtype: "<type \'property\'>"
- }
- member {
- name: "losses"
- mtype: "<type \'property\'>"
- }
- member {
- name: "name"
- mtype: "<type \'property\'>"
- }
- member {
- name: "non_trainable_variables"
- mtype: "<type \'property\'>"
- }
- member {
- name: "non_trainable_weights"
- mtype: "<type \'property\'>"
- }
- member {
- name: "outbound_nodes"
- mtype: "<type \'property\'>"
- }
- member {
- name: "output"
- mtype: "<type \'property\'>"
- }
- member {
- name: "output_mask"
- mtype: "<type \'property\'>"
- }
- member {
- name: "output_shape"
- mtype: "<type \'property\'>"
- }
- member {
- name: "scope_name"
- mtype: "<type \'property\'>"
- }
- member {
- name: "trainable_variables"
- mtype: "<type \'property\'>"
- }
- member {
- name: "trainable_weights"
- mtype: "<type \'property\'>"
- }
- member {
- name: "updates"
- mtype: "<type \'property\'>"
- }
- member {
- name: "variables"
- mtype: "<type \'property\'>"
- }
- member {
- name: "weights"
- mtype: "<type \'property\'>"
- }
- member_method {
- name: "__init__"
- argspec: "args=[\'self\', \'units\', \'activation\', \'recurrent_activation\', \'use_bias\', \'kernel_initializer\', \'recurrent_initializer\', \'bias_initializer\', \'unit_forget_bias\', \'kernel_regularizer\', \'recurrent_regularizer\', \'bias_regularizer\', \'kernel_constraint\', \'recurrent_constraint\', \'bias_constraint\', \'dropout\', \'recurrent_dropout\', \'implementation\'], varargs=None, keywords=kwargs, defaults=[\'tanh\', \'hard_sigmoid\', \'True\', \'glorot_uniform\', \'orthogonal\', \'zeros\', \'True\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'0.0\', \'0.0\', \'1\'], "
- }
- member_method {
- name: "add_loss"
- argspec: "args=[\'self\', \'losses\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
- }
- member_method {
- name: "add_update"
- argspec: "args=[\'self\', \'updates\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
- }
- member_method {
- name: "add_variable"
- argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\', \'None\'], "
- }
- member_method {
- name: "add_weight"
- argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\'], "
- }
- member_method {
- name: "apply"
- argspec: "args=[\'self\', \'inputs\'], varargs=args, keywords=kwargs, defaults=None"
- }
- member_method {
- name: "build"
- argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
- }
- member_method {
- name: "call"
- argspec: "args=[\'self\', \'inputs\', \'states\', \'training\'], varargs=None, keywords=None, defaults=[\'None\'], "
- }
- member_method {
- name: "compute_mask"
- argspec: "args=[\'self\', \'inputs\', \'mask\'], varargs=None, keywords=None, defaults=[\'None\'], "
- }
- member_method {
- name: "count_params"
- argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
- }
- member_method {
- name: "from_config"
- argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None"
- }
- member_method {
- name: "get_config"
- argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
- }
- member_method {
- name: "get_input_at"
- argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
- }
- member_method {
- name: "get_input_mask_at"
- argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
- }
- member_method {
- name: "get_input_shape_at"
- argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
- }
- member_method {
- name: "get_losses_for"
- argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None"
- }
- member_method {
- name: "get_output_at"
- argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
- }
- member_method {
- name: "get_output_mask_at"
- argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
- }
- member_method {
- name: "get_output_shape_at"
- argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
- }
- member_method {
- name: "get_updates_for"
- argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None"
- }
- member_method {
- name: "get_weights"
- argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
- }
- member_method {
- name: "set_weights"
- argspec: "args=[\'self\', \'weights\'], varargs=None, keywords=None, defaults=None"
- }
-}
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.layers.-l-s-t-m.pbtxt b/tensorflow/tools/api/golden/tensorflow.keras.layers.-l-s-t-m.pbtxt
index e1a1d0d58e..20935e2f99 100644
--- a/tensorflow/tools/api/golden/tensorflow.keras.layers.-l-s-t-m.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.keras.layers.-l-s-t-m.pbtxt
@@ -1,35 +1,15 @@
path: "tensorflow.keras.layers.LSTM"
tf_class {
is_instance: "<class \'tensorflow.python.keras._impl.keras.layers.recurrent.LSTM\'>"
- is_instance: "<class \'tensorflow.python.keras._impl.keras.layers.recurrent.RNN\'>"
+ is_instance: "<class \'tensorflow.python.keras._impl.keras.layers.recurrent.Recurrent\'>"
is_instance: "<class \'tensorflow.python.keras._impl.keras.engine.topology.Layer\'>"
is_instance: "<class \'tensorflow.python.layers.base.Layer\'>"
is_instance: "<type \'object\'>"
member {
- name: "activation"
- mtype: "<type \'property\'>"
- }
- member {
name: "activity_regularizer"
mtype: "<type \'property\'>"
}
member {
- name: "bias_constraint"
- mtype: "<type \'property\'>"
- }
- member {
- name: "bias_initializer"
- mtype: "<type \'property\'>"
- }
- member {
- name: "bias_regularizer"
- mtype: "<type \'property\'>"
- }
- member {
- name: "dropout"
- mtype: "<type \'property\'>"
- }
- member {
name: "dtype"
mtype: "<type \'property\'>"
}
@@ -38,10 +18,6 @@ tf_class {
mtype: "<type \'property\'>"
}
member {
- name: "implementation"
- mtype: "<type \'property\'>"
- }
- member {
name: "inbound_nodes"
mtype: "<type \'property\'>"
}
@@ -58,18 +34,6 @@ tf_class {
mtype: "<type \'property\'>"
}
member {
- name: "kernel_constraint"
- mtype: "<type \'property\'>"
- }
- member {
- name: "kernel_initializer"
- mtype: "<type \'property\'>"
- }
- member {
- name: "kernel_regularizer"
- mtype: "<type \'property\'>"
- }
- member {
name: "losses"
mtype: "<type \'property\'>"
}
@@ -102,34 +66,10 @@ tf_class {
mtype: "<type \'property\'>"
}
member {
- name: "recurrent_activation"
- mtype: "<type \'property\'>"
- }
- member {
- name: "recurrent_constraint"
- mtype: "<type \'property\'>"
- }
- member {
- name: "recurrent_dropout"
- mtype: "<type \'property\'>"
- }
- member {
- name: "recurrent_initializer"
- mtype: "<type \'property\'>"
- }
- member {
- name: "recurrent_regularizer"
- mtype: "<type \'property\'>"
- }
- member {
name: "scope_name"
mtype: "<type \'property\'>"
}
member {
- name: "states"
- mtype: "<type \'property\'>"
- }
- member {
name: "trainable_variables"
mtype: "<type \'property\'>"
}
@@ -138,22 +78,10 @@ tf_class {
mtype: "<type \'property\'>"
}
member {
- name: "unit_forget_bias"
- mtype: "<type \'property\'>"
- }
- member {
- name: "units"
- mtype: "<type \'property\'>"
- }
- member {
name: "updates"
mtype: "<type \'property\'>"
}
member {
- name: "use_bias"
- mtype: "<type \'property\'>"
- }
- member {
name: "variables"
mtype: "<type \'property\'>"
}
@@ -163,7 +91,7 @@ tf_class {
}
member_method {
name: "__init__"
- argspec: "args=[\'self\', \'units\', \'activation\', \'recurrent_activation\', \'use_bias\', \'kernel_initializer\', \'recurrent_initializer\', \'bias_initializer\', \'unit_forget_bias\', \'kernel_regularizer\', \'recurrent_regularizer\', \'bias_regularizer\', \'activity_regularizer\', \'kernel_constraint\', \'recurrent_constraint\', \'bias_constraint\', \'dropout\', \'recurrent_dropout\', \'implementation\', \'return_sequences\', \'return_state\', \'go_backwards\', \'stateful\', \'unroll\'], varargs=None, keywords=kwargs, defaults=[\'tanh\', \'hard_sigmoid\', \'True\', \'glorot_uniform\', \'orthogonal\', \'zeros\', \'True\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'0.0\', \'0.0\', \'1\', \'False\', \'False\', \'False\', \'False\', \'False\'], "
+ argspec: "args=[\'self\', \'units\', \'activation\', \'recurrent_activation\', \'use_bias\', \'kernel_initializer\', \'recurrent_initializer\', \'bias_initializer\', \'unit_forget_bias\', \'kernel_regularizer\', \'recurrent_regularizer\', \'bias_regularizer\', \'activity_regularizer\', \'kernel_constraint\', \'recurrent_constraint\', \'bias_constraint\', \'dropout\', \'recurrent_dropout\'], varargs=None, keywords=kwargs, defaults=[\'tanh\', \'hard_sigmoid\', \'True\', \'glorot_uniform\', \'orthogonal\', \'zeros\', \'True\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'0.0\', \'0.0\'], "
}
member_method {
name: "add_loss"
@@ -210,6 +138,10 @@ tf_class {
argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
}
member_method {
+ name: "get_constants"
+ argspec: "args=[\'self\', \'inputs\', \'training\'], varargs=None, keywords=None, defaults=[\'None\'], "
+ }
+ member_method {
name: "get_initial_state"
argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None"
}
@@ -227,7 +159,7 @@ tf_class {
}
member_method {
name: "get_losses_for"
- argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+ argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None"
}
member_method {
name: "get_output_at"
@@ -250,6 +182,10 @@ tf_class {
argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
}
member_method {
+ name: "preprocess_input"
+ argspec: "args=[\'self\', \'inputs\', \'training\'], varargs=None, keywords=None, defaults=[\'None\'], "
+ }
+ member_method {
name: "reset_states"
argspec: "args=[\'self\', \'states\'], varargs=None, keywords=None, defaults=[\'None\'], "
}
@@ -257,4 +193,8 @@ tf_class {
name: "set_weights"
argspec: "args=[\'self\', \'weights\'], varargs=None, keywords=None, defaults=None"
}
+ member_method {
+ name: "step"
+ argspec: "args=[\'self\', \'inputs\', \'states\'], varargs=None, keywords=None, defaults=None"
+ }
}
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.layers.-r-n-n.pbtxt b/tensorflow/tools/api/golden/tensorflow.keras.layers.-r-n-n.pbtxt
deleted file mode 100644
index c7c9b10f22..0000000000
--- a/tensorflow/tools/api/golden/tensorflow.keras.layers.-r-n-n.pbtxt
+++ /dev/null
@@ -1,191 +0,0 @@
-path: "tensorflow.keras.layers.RNN"
-tf_class {
- is_instance: "<class \'tensorflow.python.keras._impl.keras.layers.recurrent.RNN\'>"
- is_instance: "<class \'tensorflow.python.keras._impl.keras.engine.topology.Layer\'>"
- is_instance: "<class \'tensorflow.python.layers.base.Layer\'>"
- is_instance: "<type \'object\'>"
- member {
- name: "activity_regularizer"
- mtype: "<type \'property\'>"
- }
- member {
- name: "dtype"
- mtype: "<type \'property\'>"
- }
- member {
- name: "graph"
- mtype: "<type \'property\'>"
- }
- member {
- name: "inbound_nodes"
- mtype: "<type \'property\'>"
- }
- member {
- name: "input"
- mtype: "<type \'property\'>"
- }
- member {
- name: "input_mask"
- mtype: "<type \'property\'>"
- }
- member {
- name: "input_shape"
- mtype: "<type \'property\'>"
- }
- member {
- name: "losses"
- mtype: "<type \'property\'>"
- }
- member {
- name: "name"
- mtype: "<type \'property\'>"
- }
- member {
- name: "non_trainable_variables"
- mtype: "<type \'property\'>"
- }
- member {
- name: "non_trainable_weights"
- mtype: "<type \'property\'>"
- }
- member {
- name: "outbound_nodes"
- mtype: "<type \'property\'>"
- }
- member {
- name: "output"
- mtype: "<type \'property\'>"
- }
- member {
- name: "output_mask"
- mtype: "<type \'property\'>"
- }
- member {
- name: "output_shape"
- mtype: "<type \'property\'>"
- }
- member {
- name: "scope_name"
- mtype: "<type \'property\'>"
- }
- member {
- name: "states"
- mtype: "<type \'property\'>"
- }
- member {
- name: "trainable_variables"
- mtype: "<type \'property\'>"
- }
- member {
- name: "trainable_weights"
- mtype: "<type \'property\'>"
- }
- member {
- name: "updates"
- mtype: "<type \'property\'>"
- }
- member {
- name: "variables"
- mtype: "<type \'property\'>"
- }
- member {
- name: "weights"
- mtype: "<type \'property\'>"
- }
- member_method {
- name: "__init__"
- argspec: "args=[\'self\', \'cell\', \'return_sequences\', \'return_state\', \'go_backwards\', \'stateful\', \'unroll\', \'activity_regularizer\'], varargs=None, keywords=kwargs, defaults=[\'False\', \'False\', \'False\', \'False\', \'False\', \'None\'], "
- }
- member_method {
- name: "add_loss"
- argspec: "args=[\'self\', \'losses\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
- }
- member_method {
- name: "add_update"
- argspec: "args=[\'self\', \'updates\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
- }
- member_method {
- name: "add_variable"
- argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\', \'None\'], "
- }
- member_method {
- name: "add_weight"
- argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\'], "
- }
- member_method {
- name: "apply"
- argspec: "args=[\'self\', \'inputs\'], varargs=args, keywords=kwargs, defaults=None"
- }
- member_method {
- name: "build"
- argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
- }
- member_method {
- name: "call"
- argspec: "args=[\'self\', \'inputs\', \'mask\', \'training\', \'initial_state\', \'constants\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\'], "
- }
- member_method {
- name: "compute_mask"
- argspec: "args=[\'self\', \'inputs\', \'mask\'], varargs=None, keywords=None, defaults=None"
- }
- member_method {
- name: "count_params"
- argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
- }
- member_method {
- name: "from_config"
- argspec: "args=[\'cls\', \'config\', \'custom_objects\'], varargs=None, keywords=None, defaults=[\'None\'], "
- }
- member_method {
- name: "get_config"
- argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
- }
- member_method {
- name: "get_initial_state"
- argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None"
- }
- member_method {
- name: "get_input_at"
- argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
- }
- member_method {
- name: "get_input_mask_at"
- argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
- }
- member_method {
- name: "get_input_shape_at"
- argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
- }
- member_method {
- name: "get_losses_for"
- argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
- }
- member_method {
- name: "get_output_at"
- argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
- }
- member_method {
- name: "get_output_mask_at"
- argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
- }
- member_method {
- name: "get_output_shape_at"
- argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
- }
- member_method {
- name: "get_updates_for"
- argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None"
- }
- member_method {
- name: "get_weights"
- argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
- }
- member_method {
- name: "reset_states"
- argspec: "args=[\'self\', \'states\'], varargs=None, keywords=None, defaults=[\'None\'], "
- }
- member_method {
- name: "set_weights"
- argspec: "args=[\'self\', \'weights\'], varargs=None, keywords=None, defaults=None"
- }
-}
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.layers.-simple-r-n-n-cell.pbtxt b/tensorflow/tools/api/golden/tensorflow.keras.layers.-simple-r-n-n-cell.pbtxt
deleted file mode 100644
index 10c7f8867c..0000000000
--- a/tensorflow/tools/api/golden/tensorflow.keras.layers.-simple-r-n-n-cell.pbtxt
+++ /dev/null
@@ -1,179 +0,0 @@
-path: "tensorflow.keras.layers.SimpleRNNCell"
-tf_class {
- is_instance: "<class \'tensorflow.python.keras._impl.keras.layers.recurrent.SimpleRNNCell\'>"
- is_instance: "<class \'tensorflow.python.keras._impl.keras.engine.topology.Layer\'>"
- is_instance: "<class \'tensorflow.python.layers.base.Layer\'>"
- is_instance: "<type \'object\'>"
- member {
- name: "activity_regularizer"
- mtype: "<type \'property\'>"
- }
- member {
- name: "dtype"
- mtype: "<type \'property\'>"
- }
- member {
- name: "graph"
- mtype: "<type \'property\'>"
- }
- member {
- name: "inbound_nodes"
- mtype: "<type \'property\'>"
- }
- member {
- name: "input"
- mtype: "<type \'property\'>"
- }
- member {
- name: "input_mask"
- mtype: "<type \'property\'>"
- }
- member {
- name: "input_shape"
- mtype: "<type \'property\'>"
- }
- member {
- name: "losses"
- mtype: "<type \'property\'>"
- }
- member {
- name: "name"
- mtype: "<type \'property\'>"
- }
- member {
- name: "non_trainable_variables"
- mtype: "<type \'property\'>"
- }
- member {
- name: "non_trainable_weights"
- mtype: "<type \'property\'>"
- }
- member {
- name: "outbound_nodes"
- mtype: "<type \'property\'>"
- }
- member {
- name: "output"
- mtype: "<type \'property\'>"
- }
- member {
- name: "output_mask"
- mtype: "<type \'property\'>"
- }
- member {
- name: "output_shape"
- mtype: "<type \'property\'>"
- }
- member {
- name: "scope_name"
- mtype: "<type \'property\'>"
- }
- member {
- name: "trainable_variables"
- mtype: "<type \'property\'>"
- }
- member {
- name: "trainable_weights"
- mtype: "<type \'property\'>"
- }
- member {
- name: "updates"
- mtype: "<type \'property\'>"
- }
- member {
- name: "variables"
- mtype: "<type \'property\'>"
- }
- member {
- name: "weights"
- mtype: "<type \'property\'>"
- }
- member_method {
- name: "__init__"
- argspec: "args=[\'self\', \'units\', \'activation\', \'use_bias\', \'kernel_initializer\', \'recurrent_initializer\', \'bias_initializer\', \'kernel_regularizer\', \'recurrent_regularizer\', \'bias_regularizer\', \'kernel_constraint\', \'recurrent_constraint\', \'bias_constraint\', \'dropout\', \'recurrent_dropout\'], varargs=None, keywords=kwargs, defaults=[\'tanh\', \'True\', \'glorot_uniform\', \'orthogonal\', \'zeros\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'0.0\', \'0.0\'], "
- }
- member_method {
- name: "add_loss"
- argspec: "args=[\'self\', \'losses\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
- }
- member_method {
- name: "add_update"
- argspec: "args=[\'self\', \'updates\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
- }
- member_method {
- name: "add_variable"
- argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\', \'None\'], "
- }
- member_method {
- name: "add_weight"
- argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\'], "
- }
- member_method {
- name: "apply"
- argspec: "args=[\'self\', \'inputs\'], varargs=args, keywords=kwargs, defaults=None"
- }
- member_method {
- name: "build"
- argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
- }
- member_method {
- name: "call"
- argspec: "args=[\'self\', \'inputs\', \'states\', \'training\'], varargs=None, keywords=None, defaults=[\'None\'], "
- }
- member_method {
- name: "compute_mask"
- argspec: "args=[\'self\', \'inputs\', \'mask\'], varargs=None, keywords=None, defaults=[\'None\'], "
- }
- member_method {
- name: "count_params"
- argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
- }
- member_method {
- name: "from_config"
- argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None"
- }
- member_method {
- name: "get_config"
- argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
- }
- member_method {
- name: "get_input_at"
- argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
- }
- member_method {
- name: "get_input_mask_at"
- argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
- }
- member_method {
- name: "get_input_shape_at"
- argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
- }
- member_method {
- name: "get_losses_for"
- argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None"
- }
- member_method {
- name: "get_output_at"
- argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
- }
- member_method {
- name: "get_output_mask_at"
- argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
- }
- member_method {
- name: "get_output_shape_at"
- argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
- }
- member_method {
- name: "get_updates_for"
- argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None"
- }
- member_method {
- name: "get_weights"
- argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
- }
- member_method {
- name: "set_weights"
- argspec: "args=[\'self\', \'weights\'], varargs=None, keywords=None, defaults=None"
- }
-}
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.layers.-simple-r-n-n.pbtxt b/tensorflow/tools/api/golden/tensorflow.keras.layers.-simple-r-n-n.pbtxt
index 588df21088..f4148fcc23 100644
--- a/tensorflow/tools/api/golden/tensorflow.keras.layers.-simple-r-n-n.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.keras.layers.-simple-r-n-n.pbtxt
@@ -1,35 +1,15 @@
path: "tensorflow.keras.layers.SimpleRNN"
tf_class {
is_instance: "<class \'tensorflow.python.keras._impl.keras.layers.recurrent.SimpleRNN\'>"
- is_instance: "<class \'tensorflow.python.keras._impl.keras.layers.recurrent.RNN\'>"
+ is_instance: "<class \'tensorflow.python.keras._impl.keras.layers.recurrent.Recurrent\'>"
is_instance: "<class \'tensorflow.python.keras._impl.keras.engine.topology.Layer\'>"
is_instance: "<class \'tensorflow.python.layers.base.Layer\'>"
is_instance: "<type \'object\'>"
member {
- name: "activation"
- mtype: "<type \'property\'>"
- }
- member {
name: "activity_regularizer"
mtype: "<type \'property\'>"
}
member {
- name: "bias_constraint"
- mtype: "<type \'property\'>"
- }
- member {
- name: "bias_initializer"
- mtype: "<type \'property\'>"
- }
- member {
- name: "bias_regularizer"
- mtype: "<type \'property\'>"
- }
- member {
- name: "dropout"
- mtype: "<type \'property\'>"
- }
- member {
name: "dtype"
mtype: "<type \'property\'>"
}
@@ -54,18 +34,6 @@ tf_class {
mtype: "<type \'property\'>"
}
member {
- name: "kernel_constraint"
- mtype: "<type \'property\'>"
- }
- member {
- name: "kernel_initializer"
- mtype: "<type \'property\'>"
- }
- member {
- name: "kernel_regularizer"
- mtype: "<type \'property\'>"
- }
- member {
name: "losses"
mtype: "<type \'property\'>"
}
@@ -98,30 +66,10 @@ tf_class {
mtype: "<type \'property\'>"
}
member {
- name: "recurrent_constraint"
- mtype: "<type \'property\'>"
- }
- member {
- name: "recurrent_dropout"
- mtype: "<type \'property\'>"
- }
- member {
- name: "recurrent_initializer"
- mtype: "<type \'property\'>"
- }
- member {
- name: "recurrent_regularizer"
- mtype: "<type \'property\'>"
- }
- member {
name: "scope_name"
mtype: "<type \'property\'>"
}
member {
- name: "states"
- mtype: "<type \'property\'>"
- }
- member {
name: "trainable_variables"
mtype: "<type \'property\'>"
}
@@ -130,18 +78,10 @@ tf_class {
mtype: "<type \'property\'>"
}
member {
- name: "units"
- mtype: "<type \'property\'>"
- }
- member {
name: "updates"
mtype: "<type \'property\'>"
}
member {
- name: "use_bias"
- mtype: "<type \'property\'>"
- }
- member {
name: "variables"
mtype: "<type \'property\'>"
}
@@ -151,7 +91,7 @@ tf_class {
}
member_method {
name: "__init__"
- argspec: "args=[\'self\', \'units\', \'activation\', \'use_bias\', \'kernel_initializer\', \'recurrent_initializer\', \'bias_initializer\', \'kernel_regularizer\', \'recurrent_regularizer\', \'bias_regularizer\', \'activity_regularizer\', \'kernel_constraint\', \'recurrent_constraint\', \'bias_constraint\', \'dropout\', \'recurrent_dropout\', \'return_sequences\', \'return_state\', \'go_backwards\', \'stateful\', \'unroll\'], varargs=None, keywords=kwargs, defaults=[\'tanh\', \'True\', \'glorot_uniform\', \'orthogonal\', \'zeros\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'0.0\', \'0.0\', \'False\', \'False\', \'False\', \'False\', \'False\'], "
+ argspec: "args=[\'self\', \'units\', \'activation\', \'use_bias\', \'kernel_initializer\', \'recurrent_initializer\', \'bias_initializer\', \'kernel_regularizer\', \'recurrent_regularizer\', \'bias_regularizer\', \'activity_regularizer\', \'kernel_constraint\', \'recurrent_constraint\', \'bias_constraint\', \'dropout\', \'recurrent_dropout\'], varargs=None, keywords=kwargs, defaults=[\'tanh\', \'True\', \'glorot_uniform\', \'orthogonal\', \'zeros\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'0.0\', \'0.0\'], "
}
member_method {
name: "add_loss"
@@ -198,6 +138,10 @@ tf_class {
argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
}
member_method {
+ name: "get_constants"
+ argspec: "args=[\'self\', \'inputs\', \'training\'], varargs=None, keywords=None, defaults=[\'None\'], "
+ }
+ member_method {
name: "get_initial_state"
argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None"
}
@@ -215,7 +159,7 @@ tf_class {
}
member_method {
name: "get_losses_for"
- argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+ argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None"
}
member_method {
name: "get_output_at"
@@ -238,6 +182,10 @@ tf_class {
argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
}
member_method {
+ name: "preprocess_input"
+ argspec: "args=[\'self\', \'inputs\', \'training\'], varargs=None, keywords=None, defaults=[\'None\'], "
+ }
+ member_method {
name: "reset_states"
argspec: "args=[\'self\', \'states\'], varargs=None, keywords=None, defaults=[\'None\'], "
}
@@ -245,4 +193,8 @@ tf_class {
name: "set_weights"
argspec: "args=[\'self\', \'weights\'], varargs=None, keywords=None, defaults=None"
}
+ member_method {
+ name: "step"
+ argspec: "args=[\'self\', \'inputs\', \'states\'], varargs=None, keywords=None, defaults=None"
+ }
}
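
After this revert, `SimpleRNN` again derives from the `Recurrent` base class rather than the cell-based `RNN` wrapper. A minimal sketch of constructing and applying the layer per the `__init__` argspec above (shapes and values are illustrative, TF 1.4-era API):

```
import tensorflow as tf

# A batch of 4 sequences, 10 timesteps, 8 features each (illustrative).
inputs = tf.random_normal([4, 10, 8])

# units is the only required constructor argument; activation defaults
# to 'tanh' per the argspec above.
layer = tf.keras.layers.SimpleRNN(32)
outputs = layer(inputs)  # [4, 32]: the final output for each sequence
```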
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.layers.-stacked-r-n-n-cells.pbtxt b/tensorflow/tools/api/golden/tensorflow.keras.layers.-stacked-r-n-n-cells.pbtxt
deleted file mode 100644
index 5779e41342..0000000000
--- a/tensorflow/tools/api/golden/tensorflow.keras.layers.-stacked-r-n-n-cells.pbtxt
+++ /dev/null
@@ -1,183 +0,0 @@
-path: "tensorflow.keras.layers.StackedRNNCells"
-tf_class {
- is_instance: "<class \'tensorflow.python.keras._impl.keras.layers.recurrent.StackedRNNCells\'>"
- is_instance: "<class \'tensorflow.python.keras._impl.keras.engine.topology.Layer\'>"
- is_instance: "<class \'tensorflow.python.layers.base.Layer\'>"
- is_instance: "<type \'object\'>"
- member {
- name: "activity_regularizer"
- mtype: "<type \'property\'>"
- }
- member {
- name: "dtype"
- mtype: "<type \'property\'>"
- }
- member {
- name: "graph"
- mtype: "<type \'property\'>"
- }
- member {
- name: "inbound_nodes"
- mtype: "<type \'property\'>"
- }
- member {
- name: "input"
- mtype: "<type \'property\'>"
- }
- member {
- name: "input_mask"
- mtype: "<type \'property\'>"
- }
- member {
- name: "input_shape"
- mtype: "<type \'property\'>"
- }
- member {
- name: "losses"
- mtype: "<type \'property\'>"
- }
- member {
- name: "name"
- mtype: "<type \'property\'>"
- }
- member {
- name: "non_trainable_variables"
- mtype: "<type \'property\'>"
- }
- member {
- name: "non_trainable_weights"
- mtype: "<type \'property\'>"
- }
- member {
- name: "outbound_nodes"
- mtype: "<type \'property\'>"
- }
- member {
- name: "output"
- mtype: "<type \'property\'>"
- }
- member {
- name: "output_mask"
- mtype: "<type \'property\'>"
- }
- member {
- name: "output_shape"
- mtype: "<type \'property\'>"
- }
- member {
- name: "scope_name"
- mtype: "<type \'property\'>"
- }
- member {
- name: "state_size"
- mtype: "<type \'property\'>"
- }
- member {
- name: "trainable_variables"
- mtype: "<type \'property\'>"
- }
- member {
- name: "trainable_weights"
- mtype: "<type \'property\'>"
- }
- member {
- name: "updates"
- mtype: "<type \'property\'>"
- }
- member {
- name: "variables"
- mtype: "<type \'property\'>"
- }
- member {
- name: "weights"
- mtype: "<type \'property\'>"
- }
- member_method {
- name: "__init__"
- argspec: "args=[\'self\', \'cells\'], varargs=None, keywords=kwargs, defaults=None"
- }
- member_method {
- name: "add_loss"
- argspec: "args=[\'self\', \'losses\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
- }
- member_method {
- name: "add_update"
- argspec: "args=[\'self\', \'updates\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
- }
- member_method {
- name: "add_variable"
- argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\', \'None\'], "
- }
- member_method {
- name: "add_weight"
- argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\'], "
- }
- member_method {
- name: "apply"
- argspec: "args=[\'self\', \'inputs\'], varargs=args, keywords=kwargs, defaults=None"
- }
- member_method {
- name: "build"
- argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
- }
- member_method {
- name: "call"
- argspec: "args=[\'self\', \'inputs\', \'states\'], varargs=None, keywords=kwargs, defaults=None"
- }
- member_method {
- name: "compute_mask"
- argspec: "args=[\'self\', \'inputs\', \'mask\'], varargs=None, keywords=None, defaults=[\'None\'], "
- }
- member_method {
- name: "count_params"
- argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
- }
- member_method {
- name: "from_config"
- argspec: "args=[\'cls\', \'config\', \'custom_objects\'], varargs=None, keywords=None, defaults=[\'None\'], "
- }
- member_method {
- name: "get_config"
- argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
- }
- member_method {
- name: "get_input_at"
- argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
- }
- member_method {
- name: "get_input_mask_at"
- argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
- }
- member_method {
- name: "get_input_shape_at"
- argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
- }
- member_method {
- name: "get_losses_for"
- argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
- }
- member_method {
- name: "get_output_at"
- argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
- }
- member_method {
- name: "get_output_mask_at"
- argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
- }
- member_method {
- name: "get_output_shape_at"
- argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
- }
- member_method {
- name: "get_updates_for"
- argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None"
- }
- member_method {
- name: "get_weights"
- argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
- }
- member_method {
- name: "set_weights"
- argspec: "args=[\'self\', \'weights\'], varargs=None, keywords=None, defaults=None"
- }
-}
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.layers.pbtxt b/tensorflow/tools/api/golden/tensorflow.keras.layers.pbtxt
index fe336c4be5..8466c3e039 100644
--- a/tensorflow/tools/api/golden/tensorflow.keras.layers.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.keras.layers.pbtxt
@@ -141,10 +141,6 @@ tf_module {
mtype: "<type \'type\'>"
}
member {
- name: "GRUCell"
- mtype: "<type \'type\'>"
- }
- member {
name: "GaussianDropout"
mtype: "<type \'type\'>"
}
@@ -213,10 +209,6 @@ tf_module {
mtype: "<type \'type\'>"
}
member {
- name: "LSTMCell"
- mtype: "<type \'type\'>"
- }
- member {
name: "Lambda"
mtype: "<type \'type\'>"
}
@@ -281,10 +273,6 @@ tf_module {
mtype: "<type \'type\'>"
}
member {
- name: "RNN"
- mtype: "<type \'type\'>"
- }
- member {
name: "RepeatVector"
mtype: "<type \'type\'>"
}
@@ -305,10 +293,6 @@ tf_module {
mtype: "<type \'type\'>"
}
member {
- name: "SimpleRNNCell"
- mtype: "<type \'type\'>"
- }
- member {
name: "SpatialDropout1D"
mtype: "<type \'type\'>"
}
@@ -321,10 +305,6 @@ tf_module {
mtype: "<type \'type\'>"
}
member {
- name: "StackedRNNCells"
- mtype: "<type \'type\'>"
- }
- member {
name: "ThresholdedReLU"
mtype: "<type \'type\'>"
}
diff --git a/tensorflow/tools/api/golden/tensorflow.linalg.pbtxt b/tensorflow/tools/api/golden/tensorflow.linalg.pbtxt
index 9fd38a29b7..62e634afb8 100644
--- a/tensorflow/tools/api/golden/tensorflow.linalg.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.linalg.pbtxt
@@ -94,7 +94,7 @@ tf_module {
}
member_method {
name: "norm"
- argspec: "args=[\'tensor\', \'ord\', \'axis\', \'keep_dims\', \'name\'], varargs=None, keywords=None, defaults=[\'euclidean\', \'None\', \'False\', \'None\'], "
+ argspec: "args=[\'tensor\', \'ord\', \'axis\', \'keepdims\', \'name\', \'keep_dims\'], varargs=None, keywords=None, defaults=[\'euclidean\', \'None\', \'None\', \'None\', \'None\'], "
}
member_method {
name: "qr"
diff --git a/tensorflow/tools/api/golden/tensorflow.nn.pbtxt b/tensorflow/tools/api/golden/tensorflow.nn.pbtxt
index 24c0448dea..1e9d28ca74 100644
--- a/tensorflow/tools/api/golden/tensorflow.nn.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.nn.pbtxt
@@ -170,7 +170,7 @@ tf_module {
}
member_method {
name: "l2_normalize"
- argspec: "args=[\'x\', \'dim\', \'epsilon\', \'name\'], varargs=None, keywords=None, defaults=[\'1e-12\', \'None\'], "
+ argspec: "args=[\'x\', \'axis\', \'epsilon\', \'name\', \'dim\'], varargs=None, keywords=None, defaults=[\'None\', \'1e-12\', \'None\', \'None\'], "
}
member_method {
name: "leaky_relu"
@@ -190,7 +190,7 @@ tf_module {
}
member_method {
name: "log_softmax"
- argspec: "args=[\'logits\', \'dim\', \'name\'], varargs=None, keywords=None, defaults=[\'-1\', \'None\'], "
+ argspec: "args=[\'logits\', \'axis\', \'name\', \'dim\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\'], "
}
member_method {
name: "log_uniform_candidate_sampler"
@@ -282,17 +282,13 @@ tf_module {
}
member_method {
name: "softmax"
- argspec: "args=[\'logits\', \'dim\', \'name\'], varargs=None, keywords=None, defaults=[\'-1\', \'None\'], "
+ argspec: "args=[\'logits\', \'axis\', \'name\', \'dim\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\'], "
}
member_method {
name: "softmax_cross_entropy_with_logits"
argspec: "args=[\'_sentinel\', \'labels\', \'logits\', \'dim\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'-1\', \'None\'], "
}
member_method {
- name: "softmax_cross_entropy_with_logits_v2"
- argspec: "args=[\'_sentinel\', \'labels\', \'logits\', \'dim\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'-1\', \'None\'], "
- }
- member_method {
name: "softplus"
argspec: "args=[\'features\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
}
diff --git a/tensorflow/tools/api/golden/tensorflow.pbtxt b/tensorflow/tools/api/golden/tensorflow.pbtxt
index bf7bc6a7c1..0edd4153d7 100644
--- a/tensorflow/tools/api/golden/tensorflow.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.pbtxt
@@ -750,7 +750,7 @@ tf_module {
}
member_method {
name: "boolean_mask"
- argspec: "args=[\'tensor\', \'mask\', \'name\'], varargs=None, keywords=None, defaults=[\'boolean_mask\'], "
+ argspec: "args=[\'tensor\', \'mask\', \'name\', \'axis\'], varargs=None, keywords=None, defaults=[\'boolean_mask\', \'None\'], "
}
member_method {
name: "broadcast_dynamic_shape"
@@ -858,7 +858,7 @@ tf_module {
}
member_method {
name: "count_nonzero"
- argspec: "args=[\'input_tensor\', \'axis\', \'keep_dims\', \'dtype\', \'name\', \'reduction_indices\'], varargs=None, keywords=None, defaults=[\'None\', \'False\', \"<dtype: \'int64\'>\", \'None\', \'None\'], "
+ argspec: "args=[\'input_tensor\', \'axis\', \'keepdims\', \'dtype\', \'name\', \'reduction_indices\', \'keep_dims\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \"<dtype: \'int64\'>\", \'None\', \'None\', \'None\'], "
}
member_method {
name: "count_up_to"
@@ -1414,7 +1414,7 @@ tf_module {
}
member_method {
name: "norm"
- argspec: "args=[\'tensor\', \'ord\', \'axis\', \'keep_dims\', \'name\'], varargs=None, keywords=None, defaults=[\'euclidean\', \'None\', \'False\', \'None\'], "
+ argspec: "args=[\'tensor\', \'ord\', \'axis\', \'keepdims\', \'name\', \'keep_dims\'], varargs=None, keywords=None, defaults=[\'euclidean\', \'None\', \'None\', \'None\', \'None\'], "
}
member_method {
name: "not_equal"
@@ -1546,11 +1546,11 @@ tf_module {
}
member_method {
name: "reduce_all"
- argspec: "args=[\'input_tensor\', \'axis\', \'keep_dims\', \'name\', \'reduction_indices\'], varargs=None, keywords=None, defaults=[\'None\', \'False\', \'None\', \'None\'], "
+ argspec: "args=[\'input_tensor\', \'axis\', \'keepdims\', \'name\', \'reduction_indices\', \'keep_dims\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\'], "
}
member_method {
name: "reduce_any"
- argspec: "args=[\'input_tensor\', \'axis\', \'keep_dims\', \'name\', \'reduction_indices\'], varargs=None, keywords=None, defaults=[\'None\', \'False\', \'None\', \'None\'], "
+ argspec: "args=[\'input_tensor\', \'axis\', \'keepdims\', \'name\', \'reduction_indices\', \'keep_dims\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\'], "
}
member_method {
name: "reduce_join"
@@ -1558,27 +1558,27 @@ tf_module {
}
member_method {
name: "reduce_logsumexp"
- argspec: "args=[\'input_tensor\', \'axis\', \'keep_dims\', \'name\', \'reduction_indices\'], varargs=None, keywords=None, defaults=[\'None\', \'False\', \'None\', \'None\'], "
+ argspec: "args=[\'input_tensor\', \'axis\', \'keepdims\', \'name\', \'reduction_indices\', \'keep_dims\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\'], "
}
member_method {
name: "reduce_max"
- argspec: "args=[\'input_tensor\', \'axis\', \'keep_dims\', \'name\', \'reduction_indices\'], varargs=None, keywords=None, defaults=[\'None\', \'False\', \'None\', \'None\'], "
+ argspec: "args=[\'input_tensor\', \'axis\', \'keepdims\', \'name\', \'reduction_indices\', \'keep_dims\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\'], "
}
member_method {
name: "reduce_mean"
- argspec: "args=[\'input_tensor\', \'axis\', \'keep_dims\', \'name\', \'reduction_indices\'], varargs=None, keywords=None, defaults=[\'None\', \'False\', \'None\', \'None\'], "
+ argspec: "args=[\'input_tensor\', \'axis\', \'keepdims\', \'name\', \'reduction_indices\', \'keep_dims\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\'], "
}
member_method {
name: "reduce_min"
- argspec: "args=[\'input_tensor\', \'axis\', \'keep_dims\', \'name\', \'reduction_indices\'], varargs=None, keywords=None, defaults=[\'None\', \'False\', \'None\', \'None\'], "
+ argspec: "args=[\'input_tensor\', \'axis\', \'keepdims\', \'name\', \'reduction_indices\', \'keep_dims\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\'], "
}
member_method {
name: "reduce_prod"
- argspec: "args=[\'input_tensor\', \'axis\', \'keep_dims\', \'name\', \'reduction_indices\'], varargs=None, keywords=None, defaults=[\'None\', \'False\', \'None\', \'None\'], "
+ argspec: "args=[\'input_tensor\', \'axis\', \'keepdims\', \'name\', \'reduction_indices\', \'keep_dims\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\'], "
}
member_method {
name: "reduce_sum"
- argspec: "args=[\'input_tensor\', \'axis\', \'keep_dims\', \'name\', \'reduction_indices\'], varargs=None, keywords=None, defaults=[\'None\', \'False\', \'None\', \'None\'], "
+ argspec: "args=[\'input_tensor\', \'axis\', \'keepdims\', \'name\', \'reduction_indices\', \'keep_dims\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\'], "
}
member_method {
name: "register_tensor_conversion_function"
diff --git a/tensorflow/tools/ci_build/ci_sanity.sh b/tensorflow/tools/ci_build/ci_sanity.sh
index 8d4e4c23dc..f1c207f9b6 100755
--- a/tensorflow/tools/ci_build/ci_sanity.sh
+++ b/tensorflow/tools/ci_build/ci_sanity.sh
@@ -98,8 +98,7 @@ do_pylint() {
"^tensorflow/contrib/eager/python/evaluator\.py.*\[E0202.*method-hidden "\
"^tensorflow/contrib/eager/python/metrics_impl\.py.*\[E0202.*method-hidden "\
"^tensorflow/python/platform/gfile\.py.*\[E0301.*non-iterator "\
-"^tensorflow/python/keras/_impl/keras/callbacks\.py.*\[E1133.*not-an-iterable "\
-"^tensorflow/python/keras/_impl/keras/layers/recurrent\.py.*\[E0203.*access-member-before-definition"
+"^tensorflow/python/keras/_impl/keras/callbacks\.py.*\[E1133.*not-an-iterable"
echo "ERROR_WHITELIST=\"${ERROR_WHITELIST}\""
diff --git a/tensorflow/tools/ci_build/install/install_golang.sh b/tensorflow/tools/ci_build/install/install_golang.sh
index 55c1674495..e1edd62cc5 100755
--- a/tensorflow/tools/ci_build/install/install_golang.sh
+++ b/tensorflow/tools/ci_build/install/install_golang.sh
@@ -16,7 +16,7 @@
set -ex
-GOLANG_URL="https://storage.googleapis.com/golang/go1.9.1.linux-amd64.tar.gz"
+GOLANG_URL="https://storage.googleapis.com/golang/go1.9.2.linux-amd64.tar.gz"
sudo mkdir -p /usr/local
wget -q -O - "${GOLANG_URL}" | sudo tar -C /usr/local -xz
diff --git a/tensorflow/tools/ci_build/linux/libtensorflow_docker.sh b/tensorflow/tools/ci_build/linux/libtensorflow_docker.sh
index dcda8228bc..e5d8303c6e 100755
--- a/tensorflow/tools/ci_build/linux/libtensorflow_docker.sh
+++ b/tensorflow/tools/ci_build/linux/libtensorflow_docker.sh
@@ -48,6 +48,6 @@ ${DOCKER_BINARY} run \
-e "TF_NEED_GCP=0" \
-e "TF_NEED_HDFS=0" \
-e "TF_NEED_CUDA=${TF_NEED_CUDA}" \
- -e "TF_NEED_OPENCL=0" \
+ -e "TF_NEED_OPENCL_SYCL=0" \
"${DOCKER_IMAGE}" \
"/workspace/tensorflow/tools/ci_build/linux/libtensorflow.sh"
diff --git a/tensorflow/tools/ci_build/osx/libtensorflow_cpu.sh b/tensorflow/tools/ci_build/osx/libtensorflow_cpu.sh
index d90a1b905d..e1b56b9a25 100755
--- a/tensorflow/tools/ci_build/osx/libtensorflow_cpu.sh
+++ b/tensorflow/tools/ci_build/osx/libtensorflow_cpu.sh
@@ -27,7 +27,7 @@ export PYTHON_BIN_PATH="/usr/bin/python"
export TF_NEED_GCP=0
export TF_NEED_HDFS=0
export TF_NEED_CUDA=0
-export TF_NEED_OPENCL=0
+export TF_NEED_OPENCL_SYCL=0
export TF_NEED_MKL=0
export COMPUTECPP_PATH="/usr/local"
diff --git a/tensorflow/tools/ci_build/osx/libtensorflow_gpu.sh b/tensorflow/tools/ci_build/osx/libtensorflow_gpu.sh
index 79973647c1..5a901af3e5 100755
--- a/tensorflow/tools/ci_build/osx/libtensorflow_gpu.sh
+++ b/tensorflow/tools/ci_build/osx/libtensorflow_gpu.sh
@@ -28,7 +28,7 @@ export LD_LIBRARY_PATH="/usr/local/cuda/lib:/usr/local/cuda/extras/CUPTI/lib:${L
export PYTHON_BIN_PATH="/usr/bin/python"
export TF_NEED_GCP=0
export TF_NEED_HDFS=0
-export TF_NEED_OPENCL=0
+export TF_NEED_OPENCL_SYCL=0
export TF_NEED_MKL=0
export COMPUTECPP_PATH="/usr/local"
diff --git a/tensorflow/tools/ci_build/pi/build_raspberry_pi.sh b/tensorflow/tools/ci_build/pi/build_raspberry_pi.sh
index 5244898c40..88116d9f24 100755
--- a/tensorflow/tools/ci_build/pi/build_raspberry_pi.sh
+++ b/tensorflow/tools/ci_build/pi/build_raspberry_pi.sh
@@ -75,17 +75,23 @@ if [[ $1 == "PI_ONE" ]]; then
PI_COPTS="--copt=-march=armv6 --copt=-mfpu=vfp
--copt=-DUSE_GEMM_FOR_CONV --copt=-DUSE_OPENBLAS
--copt=-isystem --copt=${OPENBLAS_INSTALL_PATH}/include/
+ --copt=-std=gnu11 --copt=-DS_IREAD=S_IRUSR --copt=-DS_IWRITE=S_IWUSR
--linkopt=-L${OPENBLAS_INSTALL_PATH}/lib/
--linkopt=-l:libopenblas.a"
echo "Building for the Pi One/Zero, with no NEON support"
else
PI_COPTS='--copt=-march=armv7-a --copt=-mfpu=neon-vfpv4
+ --copt=-std=gnu11 --copt=-DS_IREAD=S_IRUSR --copt=-DS_IWRITE=S_IWUSR
--copt=-U__GCC_HAVE_SYNC_COMPARE_AND_SWAP_1
--copt=-U__GCC_HAVE_SYNC_COMPARE_AND_SWAP_2
--copt=-U__GCC_HAVE_SYNC_COMPARE_AND_SWAP_8'
echo "Building for the Pi Two/Three, with NEON acceleration"
fi
+# We need to pass down the environment variable with a possible alternate Python
+# include path for Python 3.x builds to work.
+export CROSSTOOL_PYTHON_INCLUDE_PATH
+
cd ${WORKSPACE_PATH}
bazel build -c opt ${PI_COPTS} \
--config=monolithic \
diff --git a/tensorflow/tools/ci_build/windows/bazel/bazel_test_lib.sh b/tensorflow/tools/ci_build/windows/bazel/bazel_test_lib.sh
index 6a8b6417d6..6c964c7227 100644
--- a/tensorflow/tools/ci_build/windows/bazel/bazel_test_lib.sh
+++ b/tensorflow/tools/ci_build/windows/bazel/bazel_test_lib.sh
@@ -117,7 +117,7 @@ function run_configure_for_cpu_build {
export TF_NEED_VERBS=0
export TF_NEED_GCP=0
export TF_NEED_HDFS=0
- export TF_NEED_OPENCL=0
+ export TF_NEED_OPENCL_SYCL=0
echo "" | ./configure
}
@@ -141,7 +141,7 @@ function run_configure_for_gpu_build {
export TF_NEED_MKL=0
export TF_NEED_GCP=0
export TF_NEED_HDFS=0
- export TF_NEED_OPENCL=0
+ export TF_NEED_OPENCL_SYCL=0
  # TODO(pcloudy): Remove this after TensorFlow uses its own CROSSTOOL
# for GPU build on Windows
diff --git a/tensorflow/tools/docker/Dockerfile.devel b/tensorflow/tools/docker/Dockerfile.devel
index 1a0145b078..20e1dcd085 100644
--- a/tensorflow/tools/docker/Dockerfile.devel
+++ b/tensorflow/tools/docker/Dockerfile.devel
@@ -83,11 +83,6 @@ ENV CI_BUILD_PYTHON python
RUN tensorflow/tools/ci_build/builds/configured CPU \
bazel build -c opt --cxxopt="-D_GLIBCXX_USE_CXX11_ABI=0" \
- # For optimized builds appropriate for the hardware platform of your choosing, uncomment below...
- # For ivy-bridge or sandy-bridge
- # --copt=-march="ivybridge" \
- # for haswell, broadwell, or skylake
- # --copt=-march="haswell" \
tensorflow/tools/pip_package:build_pip_package && \
bazel-bin/tensorflow/tools/pip_package/build_pip_package /tmp/pip && \
pip --no-cache-dir install --upgrade /tmp/pip/tensorflow-*.whl && \
diff --git a/tensorflow/tools/docker/Dockerfile.devel-gpu-cuda9-cudnn7 b/tensorflow/tools/docker/Dockerfile.devel-gpu-cuda9-cudnn7
index 64ebc4607a..9bcc3925a8 100644
--- a/tensorflow/tools/docker/Dockerfile.devel-gpu-cuda9-cudnn7
+++ b/tensorflow/tools/docker/Dockerfile.devel-gpu-cuda9-cudnn7
@@ -101,12 +101,11 @@ RUN ln -s /usr/local/cuda/lib64/stubs/libcuda.so /usr/local/cuda/lib64/stubs/lib
--jobs=${TF_AVAILABLE_CPUS} \
tensorflow/tools/pip_package:build_pip_package && \
mkdir /pip_pkg && \
- bazel-bin/tensorflow/tools/pip_package/build_pip_package /pip_pkg
-
-# Clean up pip wheel and Bazel cache when done.
-RUN pip --no-cache-dir install --upgrade /pip_pkg/tensorflow-*.whl && \
+ bazel-bin/tensorflow/tools/pip_package/build_pip_package /pip_pkg && \
+ pip --no-cache-dir install --upgrade /pip_pkg/tensorflow-*.whl && \
rm -rf /pip_pkg && \
rm -rf /root/.cache
+# Clean up pip wheel and Bazel cache when done.
WORKDIR /root
diff --git a/tensorflow/tools/docker/Dockerfile.gpu b/tensorflow/tools/docker/Dockerfile.gpu
index 0571dd7391..e212d10290 100644
--- a/tensorflow/tools/docker/Dockerfile.gpu
+++ b/tensorflow/tools/docker/Dockerfile.gpu
@@ -1,4 +1,4 @@
-FROM nvidia/cuda:8.0-cudnn6-devel-ubuntu16.04
+FROM nvidia/cuda:8.0-cudnn6-runtime-ubuntu16.04
LABEL maintainer="Craig Citro <craigcitro@google.com>"
diff --git a/tensorflow/tools/docker/README.md b/tensorflow/tools/docker/README.md
index 2e5a0038ed..e35c58ff80 100644
--- a/tensorflow/tools/docker/README.md
+++ b/tensorflow/tools/docker/README.md
@@ -60,6 +60,20 @@ Building TensorFlow Docker containers should be done through the
script. The raw Dockerfiles should not be used directly as they contain strings
to be replaced by the script during the build.
+Attempting to run [parameterized_docker_build.sh](https://github.com/tensorflow/tensorflow/blob/master/tensorflow/tools/docker/parameterized_docker_build.sh)
+from a binary docker image such as `tensorflow/tensorflow:latest` will not work.
+The script must be run from a developer docker image, which, unlike a binary
+image, contains the tensorflow source code in addition to the compiled binaries.
+Select an appropriate developer image from the
+[available tags](https://hub.docker.com/r/tensorflow/tensorflow/tags/).
+
+The minimal command line to start such a developer container is then:
+```docker run -it tensorflow/tensorflow:"right_tag"```
+
+If you would like to run a Jupyter notebook from the container, make sure to
+expose port 8888 by adding `-p 8888:8888` to the above command.
+
To use the script, specify the container type (`CPU` vs. `GPU`), the desired
Python version (`PYTHON2` vs. `PYTHON3`) and whether the developer Docker image
is to be built (`NO` vs. `YES`). In addition, you need to specify the central
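
Combining the two notes above, a Jupyter-ready developer container would be started with something like the following, where the tag is a placeholder for one of the developer tags on Docker Hub:

```docker run -it -p 8888:8888 tensorflow/tensorflow:"right_tag"```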
diff --git a/tensorflow/tools/graph_transforms/BUILD b/tensorflow/tools/graph_transforms/BUILD
index 1bf7113c9e..9216008600 100644
--- a/tensorflow/tools/graph_transforms/BUILD
+++ b/tensorflow/tools/graph_transforms/BUILD
@@ -131,6 +131,8 @@ cc_library(
"//tensorflow/core:lib",
"//tensorflow/core:protos_all_cc",
"//tensorflow/core:tensorflow",
+ "//tensorflow/contrib/rnn:gru_ops_op_lib",
+ "//tensorflow/contrib/rnn:lstm_ops_op_lib",
] + if_not_windows([
"//tensorflow/core/kernels:quantized_ops",
"//tensorflow/core/kernels:remote_fused_graph_rewriter_transform",
diff --git a/tensorflow/tools/graph_transforms/quantize_nodes.cc b/tensorflow/tools/graph_transforms/quantize_nodes.cc
index 2b85e7e83c..97e8f77616 100644
--- a/tensorflow/tools/graph_transforms/quantize_nodes.cc
+++ b/tensorflow/tools/graph_transforms/quantize_nodes.cc
@@ -759,6 +759,7 @@ Status QuantizeNodes(const GraphDef& input_graph_def,
NodeDef reshape_dims;
reshape_dims.set_op("Const");
reshape_dims.set_name(unique_input_name + "/reshape_dims");
+ AddNodeInput("^" + input_name, &reshape_dims);
SetNodeAttr("dtype", DT_INT32, &reshape_dims);
Tensor reshape_dims_tensor(DT_INT32, {1});
reshape_dims_tensor.flat<int32>()(0) = -1;
@@ -768,6 +769,7 @@ Status QuantizeNodes(const GraphDef& input_graph_def,
NodeDef reduction_dims;
reduction_dims.set_op("Const");
reduction_dims.set_name(unique_input_name + "/reduction_dims");
+ AddNodeInput("^" + input_name, &reduction_dims);
SetNodeAttr("dtype", DT_INT32, &reduction_dims);
Tensor reduction_dims_tensor(DT_INT32, {1});
reduction_dims_tensor.flat<int32>()(0) = 0;
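
For context on the two `AddNodeInput("^" + input_name, ...)` calls added above: in a GraphDef, an input name prefixed with `^` denotes a control dependency rather than a data edge, so each generated `Const` node is ordered after its originating input without consuming a tensor. A hypothetical NodeDef (names invented) showing the resulting pbtxt:

```
node {
  name: "conv1/reshape_dims"  # hypothetical unique_input_name
  op: "Const"
  input: "^conv1"             # control edge only; no tensor flows here
  attr { key: "dtype" value { type: DT_INT32 } }
}
```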
diff --git a/tensorflow/tools/pip_package/setup.py b/tensorflow/tools/pip_package/setup.py
index 456c2e2908..0c54300e06 100644
--- a/tensorflow/tools/pip_package/setup.py
+++ b/tensorflow/tools/pip_package/setup.py
@@ -29,7 +29,7 @@ from setuptools.dist import Distribution
# This version string is semver compatible, but incompatible with pip.
# For pip, we will remove all '-' characters from this string, and use the
# result for pip.
-_VERSION = '1.4.0-rc1'
+_VERSION = '1.4.0'
REQUIRED_PACKAGES = [
'absl-py',
diff --git a/tensorflow/workspace.bzl b/tensorflow/workspace.bzl
index dfe332b091..afcae6eade 100644
--- a/tensorflow/workspace.bzl
+++ b/tensorflow/workspace.bzl
@@ -1,24 +1,21 @@
# TensorFlow external dependencies that can be loaded in WORKSPACE files.
load("//third_party/gpus:cuda_configure.bzl", "cuda_configure")
-
load("//third_party/sycl:sycl_configure.bzl", "sycl_configure")
load("//third_party/mkl:build_defs.bzl", "mkl_repository")
-load(
- "@io_bazel_rules_closure//closure/private:java_import_external.bzl",
- "java_import_external",
-)
+load("@io_bazel_rules_closure//closure/private:java_import_external.bzl",
+ "java_import_external")
load("@io_bazel_rules_closure//closure:defs.bzl", "filegroup_external")
load("//third_party/py:python_configure.bzl", "python_configure")
-load(
- "//third_party/toolchains/cpus/arm:arm_compiler_configure.bzl",
- "arm_compiler_configure",
-)
+load("//third_party/toolchains/cpus/arm:arm_compiler_configure.bzl",
+ "arm_compiler_configure")
+
def _is_windows(repository_ctx):
"""Returns true if the host operating system is windows."""
return repository_ctx.os.name.lower().find("windows") != -1
+
def _get_env_var(repository_ctx, name):
"""Find an environment variable."""
if name in repository_ctx.os.environ:
@@ -26,6 +23,7 @@ def _get_env_var(repository_ctx, name):
else:
return None
+
# Parse the bazel version string from `native.bazel_version`.
def _parse_bazel_version(bazel_version):
# Remove commit from version.
@@ -41,6 +39,7 @@ def _parse_bazel_version(bazel_version):
version_tuple += (str(number),)
return version_tuple
+
# Check that a specific bazel version is being used.
def check_version(bazel_version):
if "bazel_version" not in dir(native):
@@ -57,9 +56,11 @@ def check_version(bazel_version):
fail("\nCurrent Bazel version is {}, expected at least {}\n".format(
native.bazel_version, bazel_version))
+
def _repos_are_siblings():
return Label("@foo//bar").workspace_root.startswith("../")
+
# Temporary workaround to support including TensorFlow as a submodule until this
# use-case is supported in the next Bazel release.
def _temp_workaround_http_archive_impl(repo_ctx):
@@ -72,7 +73,9 @@ def _temp_workaround_http_archive_impl(repo_ctx):
if repo_ctx.attr.patch_file != None:
_apply_patch(repo_ctx, repo_ctx.attr.patch_file)
+
temp_workaround_http_archive = repository_rule(
+ implementation = _temp_workaround_http_archive_impl,
attrs = {
"build_file": attr.label(),
"repository": attr.string(),
@@ -81,7 +84,6 @@ temp_workaround_http_archive = repository_rule(
"sha256": attr.string(default = ""),
"strip_prefix": attr.string(default = ""),
},
- implementation = _temp_workaround_http_archive_impl,
)
# Executes specified command with arguments and calls 'fail' if it exited with
@@ -93,6 +95,7 @@ def _execute_and_check_ret_code(repo_ctx, cmd_and_args):
+ "Stderr: {3}").format(" ".join(cmd_and_args), result.return_code,
result.stdout, result.stderr))
+
# Apply a patch_file to the repository root directory
# Runs 'patch -p1'
def _apply_patch(repo_ctx, patch_file):
@@ -110,6 +113,7 @@ def _apply_patch(repo_ctx, patch_file):
cmd = [bazel_sh, "-c", " ".join(cmd)]
_execute_and_check_ret_code(repo_ctx, cmd)
+
# Download the repository and apply a patch to its root
def _patched_http_archive_impl(repo_ctx):
repo_ctx.download_and_extract(
@@ -118,7 +122,9 @@ def _patched_http_archive_impl(repo_ctx):
stripPrefix=repo_ctx.attr.strip_prefix)
_apply_patch(repo_ctx, repo_ctx.attr.patch_file)
+
patched_http_archive = repository_rule(
+ implementation = _patched_http_archive_impl,
attrs = {
"patch_file": attr.label(),
"build_file": attr.label(),
@@ -127,9 +133,9 @@ patched_http_archive = repository_rule(
"sha256": attr.string(default = ""),
"strip_prefix": attr.string(default = ""),
},
- implementation = _patched_http_archive_impl,
)
+
# If TensorFlow is linked as a submodule.
# path_prefix is no longer used.
# tf_repo_name is thought to be under consideration.
@@ -442,11 +448,11 @@ def tf_workspace(path_prefix="", tf_repo_name=""):
native.http_archive(
name = "nsync",
urls = [
- "https://mirror.bazel.build/github.com/google/nsync/archive/93815892dddafe9146a5f7e7042281d59d0f4323.tar.gz",
- # "https://github.com/google/nsync/archive/93815892dddafe9146a5f7e7042281d59d0f4323.tar.gz",
+ "https://mirror.bazel.build/github.com/google/nsync/archive/4fc8ff3e7626c5f24bc9674438d8257f0ffc226c.tar.gz",
+ # "https://github.com/google/nsync/archive/4fc8ff3e7626c5f24bc9674438d8257f0ffc226c.tar.gz",
],
- sha256 = "e3bd4555415ace511338fc27e595351738eea4e9006f1612b76c82914770716b",
- strip_prefix = "nsync-93815892dddafe9146a5f7e7042281d59d0f4323",
+ sha256 = "ffbbe828f3d0bef75462e34801de5cea31d10aa63eaa42a4ed74c46521bdfd58",
+ strip_prefix = "nsync-4fc8ff3e7626c5f24bc9674438d8257f0ffc226c",
)
native.http_archive(
@@ -815,12 +821,3 @@ def tf_workspace(path_prefix="", tf_repo_name=""):
"https://github.com/google/flatbuffers/archive/971a68110e4fc1bace10fcb6deeb189e7e1a34ce.tar.gz",
],
)
-
- native.new_http_archive(
- name = "tflite_mobilenet",
- build_file = str(Label("//third_party:tflite_mobilenet.BUILD")),
- sha256 = "eb71679d23a0cbdb173b36ea39f3d3096de0a9b0410d148a8237f20cc1157a61",
- urls = [
- "https://storage.googleapis.com/download.tensorflow.org/models/tflite/mobilenet_1.0_224_quantized_2017_11_01.zip"
- ],
- )
diff --git a/third_party/aws.BUILD b/third_party/aws.BUILD
index bc6a2fd8cc..bc9e37ffb3 100644
--- a/third_party/aws.BUILD
+++ b/third_party/aws.BUILD
@@ -21,6 +21,9 @@ cc_library(
"@%ws%//tensorflow:linux_ppc64le": glob([
"aws-cpp-sdk-core/source/platform/linux-shared/*.cpp",
]),
+ "@%ws%//tensorflow:raspberry_pi_armeabi": glob([
+ "aws-cpp-sdk-core/source/platform/linux-shared/*.cpp",
+ ]),
"//conditions:default": [],
}) + glob([
"aws-cpp-sdk-core/include/**/*.h",
diff --git a/third_party/boringssl/add_boringssl_s390x.patch b/third_party/boringssl/add_boringssl_s390x.patch
deleted file mode 100644
index 8b42d10e68..0000000000
--- a/third_party/boringssl/add_boringssl_s390x.patch
+++ /dev/null
@@ -1,133 +0,0 @@
-diff --git a/src/include/openssl/base.h b/src/include/openssl/base.h
-index 7a3adfb..88012ad 100644
---- a/src/include/openssl/base.h
-+++ b/src/include/openssl/base.h
-@@ -94,6 +94,8 @@ extern "C" {
- #define OPENSSL_PNACL
- #elif defined(__myriad2__)
- #define OPENSSL_32_BIT
-+#elif defined(__s390x__)
-+#define OPENSSL_64_BIT
- #else
- #error "Unknown target CPU"
- #endif
-diff --git a/BUILD b/BUILD
-index 6b645e61..c90b7beb 100644
---- a/BUILD
-+++ b/BUILD
-@@ -40,29 +40,46 @@ config_setting(
- values = {"cpu": "darwin"},
- )
-
--boringssl_copts = [
-- # Assembler option --noexecstack adds .note.GNU-stack to each object to
-- # ensure that binaries can be built with non-executable stack.
-- "-Wa,--noexecstack",
--
-- # This is needed on Linux systems (at least) to get rwlock in pthread.
-- "-D_XOPEN_SOURCE=700",
--
-- # This list of warnings should match those in the top-level CMakeLists.txt.
-- "-Wall",
-- "-Werror",
-- "-Wformat=2",
-- "-Wsign-compare",
-- "-Wmissing-field-initializers",
-- "-Wwrite-strings",
-- "-Wshadow",
-- "-fno-common",
--
-- # Modern build environments should be able to set this to use atomic
-- # operations for reference counting rather than locks. However, it's
-- # known not to work on some Android builds.
-- # "-DOPENSSL_C11_ATOMIC",
--] + select({
-+config_setting(
-+ name = "windows",
-+ values = {"cpu": "x64_windows"},
-+ visibility = ["//visibility:public"],
-+)
-+
-+config_setting(
-+ name = "windows_msvc",
-+ values = {"cpu": "x64_windows_msvc"},
-+ visibility = ["//visibility:public"],
-+)
-+
-+boringssl_copts = select({
-+ ":windows": [
-+ "-DWIN32_LEAN_AND_MEAN",
-+ ],
-+ "//conditions:default": [
-+ # Assembler option --noexecstack adds .note.GNU-stack to each object to
-+ # ensure that binaries can be built with non-executable stack.
-+ "-Wa,--noexecstack",
-+
-+ # This is needed on Linux systems (at least) to get rwlock in pthread.
-+ "-D_XOPEN_SOURCE=700",
-+
-+ # This list of warnings should match those in the top-level CMakeLists.txt.
-+ "-Wall",
-+ "-Werror",
-+ "-Wformat=2",
-+ "-Wsign-compare",
-+ "-Wmissing-field-initializers",
-+ "-Wwrite-strings",
-+ "-Wshadow",
-+ "-fno-common",
-+
-+ # Modern build environments should be able to set this to use atomic
-+ # operations for reference counting rather than locks. However, it's
-+ # known not to work on some Android builds.
-+ # "-DOPENSSL_C11_ATOMIC",
-+ ],
-+}) + select({
- ":linux_x86_64": [],
- ":mac_x86_64": [],
- "//conditions:default": ["-DOPENSSL_NO_ASM"],
-@@ -75,18 +92,26 @@ crypto_sources_asm = select({
- })
-
- # For C targets only (not C++), compile with C11 support.
--boringssl_copts_c11 = boringssl_copts + [
-- "-std=c11",
-- "-Wmissing-prototypes",
-- "-Wold-style-definition",
-- "-Wstrict-prototypes",
--]
-+boringssl_copts_c11 = boringssl_copts + select({
-+ ":windows": [],
-+ ":windows_msvc": [],
-+ "//conditions:default": [
-+ "-std=c11",
-+ "-Wmissing-prototypes",
-+ "-Wold-style-definition",
-+ "-Wstrict-prototypes",
-+ ],
-+})
-
- # For C targets only (not C++), compile with C11 support.
--boringssl_copts_cxx = boringssl_copts + [
-- "-std=c++11",
-- "-Wmissing-declarations",
--]
-+boringssl_copts_cxx = boringssl_copts + select({
-+ ":windows": [],
-+ ":windows_msvc": [],
-+ "//conditions:default": [
-+ "-std=c++11",
-+ "-Wmissing-declarations",
-+ ],
-+})
-
- cc_library(
- name = "crypto",
-@@ -96,6 +121,8 @@ cc_library(
- includes = ["src/include"],
- linkopts = select({
- ":mac_x86_64": [],
-+ ":windows": [],
-+ ":windows_msvc": [],
- "//conditions:default": ["-lpthread"],
- }),
- visibility = ["//visibility:public"],
diff --git a/third_party/curl.BUILD b/third_party/curl.BUILD
index 882967df1c..805a30d262 100644
--- a/third_party/curl.BUILD
+++ b/third_party/curl.BUILD
@@ -477,7 +477,6 @@ genrule(
"# define HAVE_RAND_EGD 1",
"# define HAVE_RAND_STATUS 1",
"# define HAVE_SSL_GET_SHUTDOWN 1",
- "# define HAVE_STROPTS_H 1",
"# define HAVE_TERMIOS_H 1",
"# define OS \"x86_64-pc-linux-gnu\"",
"# define RANDOM_FILE \"/dev/urandom\"",
diff --git a/third_party/nanopb.BUILD b/third_party/nanopb.BUILD
deleted file mode 100644
index d21866911b..0000000000
--- a/third_party/nanopb.BUILD
+++ /dev/null
@@ -1,23 +0,0 @@
-# Description:
-# Nanopb, a tiny ANSI C protobuf implementation for use on embedded devices.
-
-licenses(["notice"]) # zlib license
-
-exports_files(["LICENSE.txt"])
-
-cc_library(
- name = "nanopb",
- srcs = [
- "pb_common.c",
- "pb_decode.c",
- "pb_encode.c",
- ],
- hdrs = [
- "pb.h",
- "pb_common.h",
- "pb_decode.h",
- "pb_encode.h",
- ],
- includes = ["."],
- visibility = ["//visibility:public"],
-)
diff --git a/third_party/sycl/crosstool/CROSSTOOL.tpl b/third_party/sycl/crosstool/CROSSTOOL.tpl
index 32884d71e7..f8e50efcc6 100755
--- a/third_party/sycl/crosstool/CROSSTOOL.tpl
+++ b/third_party/sycl/crosstool/CROSSTOOL.tpl
@@ -35,10 +35,10 @@ toolchain {
tool_path { name: "compat-ld" path: "/usr/bin/ld" }
tool_path { name: "cpp" path: "/usr/bin/cpp" }
tool_path { name: "dwp" path: "/usr/bin/dwp" }
- tool_path { name: "gcc" path: "computecpp" }
+ tool_path { name: "gcc" path: "%{sycl_impl}" }
# Use "-std=c++11" for nvcc. For consistency, force both the host compiler
# and the device compiler to use "-std=c++11".
- cxx_flag: "-std=c++11"
+ cxx_flag: "%{c++_std}"
linker_flag: "-Wl,-no-as-needed"
linker_flag: "-lstdc++"
linker_flag: "-B/usr/bin/"
@@ -53,7 +53,7 @@ toolchain {
cxx_builtin_include_directory: "/usr/local/include"
cxx_builtin_include_directory: "/usr/include"
- cxx_builtin_include_directory: "%{computecpp_toolkit_path}"
+ cxx_builtin_include_directory: "%{sycl_include_dir}"
cxx_builtin_include_directory: "%{python_lib_path}"
tool_path { name: "gcov" path: "/usr/bin/gcov" }
@@ -214,4 +214,4 @@ toolchain {
compiler_flag: "-O2"
compiler_flag: "-DNDEBUG"
}
-}
+} \ No newline at end of file
diff --git a/third_party/sycl/crosstool/trisycl.tpl b/third_party/sycl/crosstool/trisycl.tpl
new file mode 100644
index 0000000000..b470772fbf
--- /dev/null
+++ b/third_party/sycl/crosstool/trisycl.tpl
@@ -0,0 +1,73 @@
+#!/usr/bin/env python
+
+import os
+import sys
+import tempfile
+from subprocess import call
+
+CPU_CXX_COMPILER = ('%{host_cxx_compiler}')
+CPU_C_COMPILER = ('%{host_c_compiler}')
+
+CURRENT_DIR = os.path.dirname(sys.argv[0])
+TRISYCL_INCLUDE_DIR = CURRENT_DIR + '/../sycl/include'
+
+def main():
+ compiler_flags = []
+
+ remove_flags = ('-Wl,--no-undefined', '-Wno-unused-but-set-variable', '-Wignored-attributes', '-fno-exceptions')
+  # remove -fsanitize-coverage from the flags when compiling with g++
+ if 'g++' in CPU_CXX_COMPILER:
+ remove_flags += ('-fsanitize-coverage',)
+ compiler_flags += ['-fopenmp']
+ else:
+ compiler_flags += ['-fopenmp=libomp']
+
+ compiler_flags += [flag for flag in sys.argv[1:] if not flag.startswith(remove_flags)]
+
+
+ output_file_index = compiler_flags.index('-o') + 1
+ output_file_name = compiler_flags[output_file_index]
+
+ if(output_file_index == 1):
+ # we are linking
+ return call([CPU_CXX_COMPILER] + compiler_flags +
+ ['-Wl,--no-undefined'])
+
+ # find what we compile
+ compiling_cpp = 0
+ if('-c' in compiler_flags):
+ compiled_file_index = compiler_flags.index('-c') + 1
+ compiled_file_name = compiler_flags[compiled_file_index]
+ if(compiled_file_name.endswith(('.cc', '.c++', '.cpp', '.CPP',
+ '.C', '.cxx'))):
+ compiling_cpp = 1;
+
+ debug_flags = ['-DTRISYCL_DEBUG', '-DBOOST_LOG_DYN_LINK', '-DTRISYCL_TRACE_KERNEL', '-lpthread', '-lboost_log', '-g', '-rdynamic']
+
+ opt_flags = ['-DNDEBUG', '-DBOOST_DISABLE_ASSERTS', '-O3']
+
+ compiler_flags = compiler_flags + ['-DEIGEN_USE_SYCL=1',
+ '-DEIGEN_HAS_C99_MATH',
+ '-DEIGEN_MAX_ALIGN_BYTES=16',
+ '-DTENSORFLOW_USE_SYCL'] + opt_flags
+
+ if(compiling_cpp == 1):
+ # create a blacklist of folders that will be skipped when compiling
+ # with triSYCL
+ skip_extensions = [".cu.cc"]
+ skip_folders = ["tensorflow/compiler", "tensorflow/docs_src", "tensorflow/tensorboard", "third_party", "external", "hexagon"]
+ skip_folders = [(folder + '/') for folder in skip_folders]
+ # if compiling external project skip triSYCL
+ if any(compiled_file_name.endswith(_ext) for _ext in skip_extensions) or any(_folder in output_file_name for _folder in skip_folders):
+ return call([CPU_CXX_COMPILER] + compiler_flags)
+
+ host_compiler_flags = ['-xc++', '-Wno-unused-variable',
+ '-I', TRISYCL_INCLUDE_DIR] + compiler_flags
+ x = call([CPU_CXX_COMPILER] + host_compiler_flags)
+ return x
+ else:
+ # compile for C
+ return call([CPU_C_COMPILER] + compiler_flags)
+
+if __name__ == '__main__':
+ sys.exit(main())
diff --git a/third_party/sycl/sycl/BUILD.tpl b/third_party/sycl/sycl/BUILD.tpl
index 6cad190630..b6ceaadda7 100755
--- a/third_party/sycl/sycl/BUILD.tpl
+++ b/third_party/sycl/sycl/BUILD.tpl
@@ -10,16 +10,27 @@ package(default_visibility = ["//visibility:public"])
exports_files(["LICENSE.text"])
config_setting(
- name = "using_sycl",
- values = {
- "define": "using_sycl=true",
+ name = "using_sycl_ccpp",
+ define_values = {
+ "using_sycl": "true",
+ "using_trisycl": "false",
},
)
+config_setting(
+ name = "using_sycl_trisycl",
+ define_values = {
+ "using_sycl": "true",
+ "using_trisycl": "false",
+ },
+)
+
+
cc_library(
name = "sycl_headers",
hdrs = glob([
"**/*.h",
+ "**/*.hpp",
]),
includes = [".", "include"],
)
diff --git a/third_party/sycl/sycl/build_defs.bzl.tpl b/third_party/sycl/sycl/build_defs.bzl.tpl
index 09bef0a661..33386f8957 100755
--- a/third_party/sycl/sycl/build_defs.bzl.tpl
+++ b/third_party/sycl/sycl/build_defs.bzl.tpl
@@ -5,9 +5,24 @@ def if_sycl(if_true, if_false = []):
Returns a select statement which evaluates to if_true if we're building
with SYCL enabled. Otherwise, the select statement evaluates to if_false.
+ If we are building with triSYCL instead of ComputeCPP, a list with
+ the first element of if_true is returned.
+ """
+ return select({
+ "@local_config_sycl//sycl:using_sycl_ccpp": if_true,
+ "@local_config_sycl//sycl:using_sycl_trisycl": if_true[0:1],
+ "//conditions:default": if_false
+ })
+
+def if_ccpp(if_true, if_false = []):
+ """Shorthand for select()'ing if we are building with ComputeCPP.
+ Returns a select statement which evaluates to if_true if we're building
+ with ComputeCPP enabled. Otherwise, the select statement evaluates
+ to if_false.
"""
return select({
- "@local_config_sycl//sycl:using_sycl": if_true,
+ "@local_config_sycl//sycl:using_sycl_ccpp": if_true,
+ "@local_config_sycl//sycl:using_sycl_trisycl": if_false,
"//conditions:default": if_false
})
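
These macros are consumed from BUILD files. A hypothetical target (load label per the local_config_sycl repository, target and file names invented) selecting SYCL-only sources would look like:

```
load("@local_config_sycl//sycl:build_defs.bzl", "if_sycl")

cc_library(
    name = "sycl_kernels",  # hypothetical target
    srcs = ["kernels.cc"] + if_sycl(["kernels_sycl.cc"]),
)
```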
diff --git a/third_party/sycl/sycl_configure.bzl b/third_party/sycl/sycl_configure.bzl
index 7af063178e..a0c9e4e43a 100644
--- a/third_party/sycl/sycl_configure.bzl
+++ b/third_party/sycl/sycl_configure.bzl
@@ -5,20 +5,26 @@
* HOST_CXX_COMPILER: The host C++ compiler
* HOST_C_COMPILER: The host C compiler
* COMPUTECPP_TOOLKIT_PATH: The path to the ComputeCpp toolkit.
+ * TRISYCL_INCLUDE_DIR: The path to the include directory of triSYCL.
+ (if using triSYCL instead of ComputeCPP)
* PYTHON_LIB_PATH: The path to the python lib
"""
_HOST_CXX_COMPILER = "HOST_CXX_COMPILER"
_HOST_C_COMPILER= "HOST_C_COMPILER"
_COMPUTECPP_TOOLKIT_PATH = "COMPUTECPP_TOOLKIT_PATH"
+_TRISYCL_INCLUDE_DIR = "TRISYCL_INCLUDE_DIR"
_PYTHON_LIB_PATH = "PYTHON_LIB_PATH"
def _enable_sycl(repository_ctx):
- if "TF_NEED_OPENCL" in repository_ctx.os.environ:
- enable_sycl = repository_ctx.os.environ["TF_NEED_OPENCL"].strip()
+ if "TF_NEED_OPENCL_SYCL" in repository_ctx.os.environ:
+ enable_sycl = repository_ctx.os.environ["TF_NEED_OPENCL_SYCL"].strip()
return enable_sycl == "1"
return False
+def _enable_compute_cpp(repository_ctx):
+ return _COMPUTECPP_TOOLKIT_PATH in repository_ctx.os.environ
+
def auto_configure_fail(msg):
"""Output failure message when auto configuration fails."""
red = "\033[0;31m"
@@ -59,6 +65,15 @@ def find_computecpp_root(repository_ctx):
return sycl_name
fail("Cannot find SYCL compiler, please correct your path")
+def find_trisycl_include_dir(repository_ctx):
+ """Find triSYCL include directory. """
+ sycl_name = ""
+ if _TRISYCL_INCLUDE_DIR in repository_ctx.os.environ:
+ sycl_name = repository_ctx.os.environ[_TRISYCL_INCLUDE_DIR].strip()
+ if sycl_name.startswith("/"):
+ return sycl_name
+ fail( "Cannot find triSYCL include directory, please correct your path")
+
def find_python_lib(repository_ctx):
"""Returns python path."""
if _PYTHON_LIB_PATH in repository_ctx.os.environ:
@@ -171,26 +186,53 @@ def _sycl_autoconf_imp(repository_ctx):
_tpl(repository_ctx, "sycl:platform.bzl")
_tpl(repository_ctx, "crosstool:BUILD")
_file(repository_ctx, "sycl:LICENSE.text")
- _tpl(repository_ctx, "crosstool:computecpp",
- {
- "%{host_cxx_compiler}" : find_cc(repository_ctx),
- "%{host_c_compiler}" : find_c(repository_ctx),
- })
-
- computecpp_root = find_computecpp_root(repository_ctx)
- _check_dir(repository_ctx, computecpp_root)
-
- _tpl(repository_ctx, "crosstool:CROSSTOOL",
- {
- "%{computecpp_toolkit_path}" : computecpp_root,
- "%{python_lib_path}" : find_python_lib(repository_ctx),
- })
-
- # symlink libraries
- _check_lib(repository_ctx, computecpp_root+"/lib", "libComputeCpp.so" )
- _symlink_dir(repository_ctx, computecpp_root + "/lib", "sycl/lib")
- _symlink_dir(repository_ctx, computecpp_root + "/include", "sycl/include")
- _symlink_dir(repository_ctx, computecpp_root + "/bin", "sycl/bin")
+
+ if _enable_compute_cpp(repository_ctx):
+ _tpl(repository_ctx, "crosstool:computecpp",
+ {
+ "%{host_cxx_compiler}" : find_cc(repository_ctx),
+ "%{host_c_compiler}" : find_c(repository_ctx)
+ })
+
+ computecpp_root = find_computecpp_root(repository_ctx);
+ _check_dir(repository_ctx, computecpp_root)
+
+ _tpl(repository_ctx, "crosstool:CROSSTOOL",
+ {
+ "%{sycl_include_dir}" : computecpp_root,
+ "%{sycl_impl}" : "computecpp",
+ "%{c++_std}" : "-std=c++11",
+ "%{python_lib_path}" : find_python_lib(repository_ctx),
+ })
+
+ # symlink libraries
+ _check_lib(repository_ctx, computecpp_root+"/lib", "libComputeCpp.so" )
+ _symlink_dir(repository_ctx, computecpp_root + "/lib", "sycl/lib")
+ _symlink_dir(repository_ctx, computecpp_root + "/include", "sycl/include")
+ _symlink_dir(repository_ctx, computecpp_root + "/bin", "sycl/bin")
+ else:
+
+ trisycl_include_dir = find_trisycl_include_dir(repository_ctx);
+ _check_dir(repository_ctx, trisycl_include_dir)
+
+ _tpl(repository_ctx, "crosstool:trisycl",
+ {
+ "%{host_cxx_compiler}" : find_cc(repository_ctx),
+ "%{host_c_compiler}" : find_c(repository_ctx),
+ "%{trisycl_include_dir}" : trisycl_include_dir
+ })
+
+
+ _tpl(repository_ctx, "crosstool:CROSSTOOL",
+ {
+ "%{sycl_include_dir}" : trisycl_include_dir,
+ "%{sycl_impl}" : "trisycl",
+ "%{c++_std}" : "-std=c++1y",
+ "%{python_lib_path}" : find_python_lib(repository_ctx),
+ })
+
+ _symlink_dir(repository_ctx, trisycl_include_dir, "sycl/include")
+
sycl_configure = repository_rule(
implementation = _sycl_autoconf_imp,
diff --git a/third_party/tflite_mobilenet.BUILD b/third_party/tflite_mobilenet.BUILD
deleted file mode 100644
index 75663eff48..0000000000
--- a/third_party/tflite_mobilenet.BUILD
+++ /dev/null
@@ -1,13 +0,0 @@
-package(default_visibility = ["//visibility:public"])
-
-licenses(["notice"]) # Apache 2.0
-
-filegroup(
- name = "model_files",
- srcs = glob(
- ["**/*"],
- exclude = [
- "BUILD",
- ],
- ),
-)
diff --git a/third_party/zlib.BUILD b/third_party/zlib.BUILD
index 8509668891..d164ee719c 100644
--- a/third_party/zlib.BUILD
+++ b/third_party/zlib.BUILD
@@ -49,7 +49,7 @@ cc_library(
":windows_msvc": [],
"//conditions:default": [
"-Wno-shift-negative-value",
- "-Wno-implicit-function-declaration",
+ "-DZ_HAVE_UNISTD_H",
],
}),
includes = ["."],
diff --git a/tools/bazel.rc b/tools/bazel.rc
index 414ddf2e47..f609efe188 100644
--- a/tools/bazel.rc
+++ b/tools/bazel.rc
@@ -9,13 +9,16 @@ build:win-cuda --define=using_cuda=true --define=using_cuda_nvcc=true
build:mkl --define=using_mkl=true
build:sycl --crosstool_top=@local_config_sycl//crosstool:toolchain
-build:sycl --define=using_sycl=true
+build:sycl --define=using_sycl=true --define=using_trisycl=false
build:sycl_nodouble --crosstool_top=@local_config_sycl//crosstool:toolchain
build:sycl_nodouble --define=using_sycl=true --cxxopt -DTENSORFLOW_SYCL_NO_DOUBLE
build:sycl_asan --crosstool_top=@local_config_sycl//crosstool:toolchain
-build:sycl_asan --define=using_sycl=true --copt -fno-omit-frame-pointer --copt -fsanitize-coverage=3 --copt -DGPR_NO_DIRECT_SYSCALLS --linkopt -fPIC --linkopt -fsanitize=address
+build:sycl_asan --define=using_sycl=true --define=using_trisycl=false --copt -fno-omit-frame-pointer --copt -fsanitize-coverage=3 --copt -DGPR_NO_DIRECT_SYSCALLS --linkopt -fPIC --linkopt -fsanitize=address
+
+build:sycl_trisycl --crosstool_top=@local_config_sycl//crosstool:toolchain
+build:sycl_trisycl --define=using_sycl=true --define=using_trisycl=true
build --define=use_fast_cpp_protos=true
build --define=allow_oversize_protos=true
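
With the new stanza in place, a triSYCL build would be selected on the command line with something like the following (target chosen for illustration):

```bazel build --config=sycl_trisycl //tensorflow/tools/pip_package:build_pip_package```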